classdef DataManipulator < handle
    %DataManipulator Loads and prepares data for training and testing models.
    %   Reads a CSV dataset whose last column is an integer class label,
    %   one-hot encodes the labels, optionally min-max normalizes the
    %   features, and splits the rows into source/target stream chunks to
    %   simulate a multistream classification setting.

    properties (Access = public)
        data = [];          % Whole dataset: [features, labels]
        nFeatures = 0;      % Number of feature columns in the dataset
        nClasses = 0;       % Number of classes (= number of label columns after one-hot encoding)
        nFoldElements = 0;  % Number of elements per fold
        nMinibatches = 0;   % Number of minibatches (folds) produced by the last split
        source = {};        % Source stream chunks, one cell per minibatch
        target = {};        % Target stream chunks, one cell per minibatch
    end

    properties (Access = private)
        X = {};                 % Input data (matrix after loading, cell per minibatch after a split)
        y = {};                 % Class data (matrix after loading, cell per minibatch after a split)
        Xs = {};                % Source input data, one cell per minibatch
        ys = {};                % Source class data, one cell per minibatch
        Xt = {};                % Target input data, one cell per minibatch
        yt = {};                % Target class data, one cell per minibatch
        permutedX = {};         % Row-permuted input data, one cell per minibatch
        permutedy = {};         % Row-permuted class data, one cell per minibatch
        indexPermutation = {};  % Permutation used per minibatch (tells source rows from target rows)
        dataFolderPath = '';    % Folder the CSV files are read from
    end

    methods (Access = public)
        function self = DataManipulator(dataFolderPath)
            %DataManipulator Create a manipulator bound to a data folder.
            %   dataFolderPath (char)
            %       Folder that contains the CSV files. Optional; defaults
            %       to '' (file names are then used as given).
            if nargin < 1
                dataFolderPath = '';
            end
            self.dataFolderPath = dataFolderPath;
        end

        function loadSourceCSV(self, dataset)
            %loadSourceCSV Load "<dataset>_source.csv" from the data folder.
            %   Plain concatenation is used instead of join(), which would
            %   insert a space between the parts for a string input.
            self.loadCustomCSV([char(dataset), '_source.csv']);
        end

        function loadTargetCSV(self, dataset)
            %loadTargetCSV Load "<dataset>_target.csv" from the data folder.
            self.loadCustomCSV([char(dataset), '_target.csv']);
        end

        function loadCustomCSV(self, filename)
            %loadCustomCSV Load a CSV file and one-hot encode its last column.
            %   The last column is taken as the class label; every other
            %   column is a feature. A label equal to i sets column i of the
            %   one-hot matrix, so labels are expected to be 1..nClasses.
            %   After the call, self.data is [features, oneHotLabels].
            %   filename (char)
            %       File name relative to the data folder.

            % fullfile inserts the separator only when it is missing.
            self.data = double(csvread(fullfile(self.dataFolderPath, filename)));
            self.checkDatasetEven();

            self.nFeatures = size(self.data, 2) - 1;
            features = self.data(:, 1:self.nFeatures);
            labels = self.data(:, end);

            self.nClasses = max(labels);
            oneHot = zeros(size(labels, 1), self.nClasses);
            for c = 1 : self.nClasses
                oneHot(labels == c, c) = 1;
            end

            self.X = features;
            self.y = oneHot;
            self.data = [features oneHot];
        end

        function normalize(self)
            %normalize Min-max scale every feature column to [0, 1].
            %   Each feature becomes (x - min) / (max - min). A constant
            %   feature (max == min) is mapped to 0 rather than NaN.
            %   Label columns are left untouched.
            fprintf('Normalizing data\n');
            for i = 1 : self.nFeatures
                column = self.data(:, i);
                lowest = min(column, [], 'all');
                span = max(column, [], 'all') - lowest;
                if span == 0
                    span = 1; % avoid 0/0 on constant columns
                end
                self.data(:, i) = (column - lowest) / span;
            end
            self.X = self.data(:, 1 : self.nFeatures);
            self.y = self.data(:, self.nFeatures + 1 : end);
        end

        function splitAsSourceTargetStreams(self, nFoldElements, method, samplingRatio)
            %splitAsSourceTargetStreams Split the data into source/target streams.
            %   Simulates the input of a multistream classification problem,
            %   in which two different but related processes generate data
            %   from the same domain (here, self.data): the source stream is
            %   labeled, the target stream is treated as unlabeled. Labels
            %   are still stored for the target stream; they should only be
            %   used for evaluation purposes.
            %   nFoldElements (integer)
            %       Number of rows per chunk/fold. Pass zero or
            %       size(data,1) to obtain a single chunk.
            %   method (char)
            %       How rows are distributed between source and target:
            %       'none':     every chunk is stored as both source and
            %                   target; samplingRatio is not used.
            %       'dallas_1': bias from "An adaptive framework for
            %                   multistream classification" (UT Dallas).
            %       'dallas_2': bias from "FUSION - An online method for
            %                   multistream classification" (UT Dallas).
            %   samplingRatio (double)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream; the remainder goes to target.
            %   Raises DataManipulator:unknownSplitMethod for any other
            %   method name.
            if nFoldElements == 0
                self.nFoldElements = size(self.data, 1);
            else
                self.nFoldElements = nFoldElements;
            end

            switch method
                case 'none'
                    self.splitAsSourceTargetStreams_none(self.nFoldElements, samplingRatio);
                case 'dallas_1'
                    self.splitAsSourceTargetStreams_dallas1(self.nFoldElements, samplingRatio);
                case 'dallas_2'
                    self.splitAsSourceTargetStreams_dallas2(self.nFoldElements, samplingRatio);
                otherwise
                    % Without this, the split would be skipped silently and
                    % the per-stream matrices built from stale state.
                    error('DataManipulator:unknownSplitMethod', ...
                        'Unknown split method. Use ''none'', ''dallas_1'' or ''dallas_2''.');
            end

            self.createXsYsXtYt();
        end

        function X = getX(self, idx)
            %getX Index the input data by row.
            %   Before a split, X is the feature matrix and rows idx are
            %   returned. After a split, X is a cell array with one cell
            %   per minibatch, so the result is a cell sub-array.
            X = self.X(idx, :);
        end

        function y = getY(self, idx)
            %getY Index the class data by row (same conventions as getX).
            y = self.y(idx, :);
        end

        function Xs = getXs(self, nMinibatch)
            %getXs Input matrix of one source-stream minibatch.
            %   Only available after splitAsSourceTargetStreams was called.
            %   nMinibatch (integer)
            %       The minibatch iteration
            Xs = self.Xs{nMinibatch};
        end

        function ys = getYs(self, nMinibatch)
            %getYs Class matrix of one source-stream minibatch.
            %   Only available after splitAsSourceTargetStreams was called.
            %   nMinibatch (integer)
            %       The minibatch iteration
            ys = self.ys{nMinibatch};
        end

        function Xt = getXt(self, nMinibatch)
            %getXt Input matrix of one target-stream minibatch.
            %   Only available after splitAsSourceTargetStreams was called.
            %   nMinibatch (integer)
            %       The minibatch iteration
            Xt = self.Xt{nMinibatch};
        end

        function yt = getYt(self, nMinibatch)
            %getYt Class matrix of one target-stream minibatch.
            %   Only available after splitAsSourceTargetStreams was called.
            %   nMinibatch (integer)
            %       The minibatch iteration
            yt = self.yt{nMinibatch};
        end
    end

    methods (Access = private)
        function splitAsSourceTargetStreams_none(self, elementsPerFold, ~)
            %splitAsSourceTargetStreams_none Chunk the data without any bias.
            %   Cuts self.data into consecutive chunks of elementsPerFold
            %   rows (the last chunk may be shorter) and stores every chunk
            %   as BOTH the source and the target of that minibatch.
            %   The sampling ratio argument is accepted but not used.
            %   NOTE(review): source and target being identical looks
            %   intentional for the "no split" baseline - confirm.
            %   elementsPerFold (integer)
            %       Number of rows per chunk/fold.
            rowsNumber = size(self.data, 1);
            self.nFoldElements = elementsPerFold;
            self.source = {};
            self.target = {};

            step = ceil(elementsPerFold);
            fold = 0;
            % Iterate up to and including the last row.
            for first = 1 : step : rowsNumber
                last = min(first + step - 1, rowsNumber);
                chunk = self.data(first:last, :);
                fold = fold + 1;
                self.source{fold} = chunk;
                self.target{fold} = chunk;
            end
            self.nMinibatches = fold;
        end

        function splitAsSourceTargetStreams_dallas1(self, elementsPerFold, samplingRatio)
            %splitAsSourceTargetStreams_dallas1 Biased split, first Dallas scheme.
            %   Bias described by the paper "An adaptive framework for
            %   multistream classification" from the CS department of the
            %   University of Texas at Dallas. Within each chunk, every row
            %   gets the score exp(-||x - mean(x)||^2); rows are sorted by
            %   ascending score, the first ceil(m * samplingRatio) rows go
            %   to source and the rest to target.
            %   elementsPerFold (integer)
            %       Number of rows per chunk/fold.
            %   samplingRatio (double)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream.
            rowsNumber = size(self.data, 1);
            numberOfFolds = round(rowsNumber / elementsPerFold);
            chunkSize = round(rowsNumber / numberOfFolds);
            numberOfFoldsRounded = round(rowsNumber / chunkSize);

            % floor keeps the chunk size an integer so it can be used as a
            % row index.
            self.nFoldElements = min(elementsPerFold, floor(rowsNumber / numberOfFoldsRounded));
            if rowsNumber / numberOfFoldsRounded > elementsPerFold
                numberOfFolds = numberOfFolds + 1;
            end
            self.nMinibatches = numberOfFolds;
            self.source = {};
            self.target = {};

            ck = self.nFoldElements;
            for i = 1 : numberOfFolds
                rows = self.foldRowRange(i, ck, numberOfFoldsRounded, rowsNumber);
                chunk = self.data(rows, :);
                x = chunk(:, 1:end-self.nClasses);

                % One score per row: squared Euclidean distance to the
                % chunk mean. mean(x, 1) stays a per-feature mean even when
                % the chunk has a single row.
                deviation = x - mean(x, 1);
                probability = exp(-sum(deviation .^ 2, 2));

                self.storeBiasedSplit(i, chunk, probability, samplingRatio);
            end
        end

        function splitAsSourceTargetStreams_dallas2(self, elementsPerFold, samplingRatio)
            %splitAsSourceTargetStreams_dallas2 Biased split, second Dallas scheme.
            %   Bias described by the paper "FUSION - An online method for
            %   multistream classification" from the University of Texas at
            %   Dallas. Within each chunk, every row gets the score
            %   exp(-||x - mean||_2 / (2 * std(||x - mean||_1)^2)); rows are
            %   sorted by ascending score, the first
            %   ceil(m * samplingRatio) rows go to source and the rest to
            %   target. The chunk size is forced to an even number.
            %   elementsPerFold (integer)
            %       Number of rows per chunk/fold.
            %   samplingRatio (double)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream.
            rowsNumber = size(self.data, 1);
            numberOfFolds = round(rowsNumber / elementsPerFold);
            chunkSize = round(rowsNumber / numberOfFolds);
            numberOfFoldsRounded = round(rowsNumber / chunkSize);

            rowsPerFold = floor(rowsNumber / numberOfFoldsRounded);
            if mod(rowsPerFold, 2) ~= 0
                rowsPerFold = rowsPerFold - 1; % keep the chunk size even
            end
            self.nFoldElements = min(elementsPerFold, rowsPerFold);

            if rowsNumber / numberOfFoldsRounded > elementsPerFold
                numberOfFolds = numberOfFolds + 1;
            end
            self.nMinibatches = numberOfFolds;
            self.source = {};
            self.target = {};

            ck = self.nFoldElements;
            for i = 1 : numberOfFolds
                rows = self.foldRowRange(i, ck, numberOfFoldsRounded, rowsNumber);
                chunk = self.data(rows, :);
                x = chunk(:, 1:end-self.nClasses);

                % Row-wise norms; the explicit dimension keeps one value
                % per row even when there is a single feature column.
                deviation = x - mean(x, 1);
                norm_1 = vecnorm(deviation, 1, 2);
                norm_2 = vecnorm(deviation, 2, 2);
                probability = exp(-norm_2 / (2 * std(norm_1) ^ 2));

                self.storeBiasedSplit(i, chunk, probability, samplingRatio);
            end
        end

        function rows = foldRowRange(~, foldIndex, ck, numberOfFoldsRounded, rowsNumber)
            %foldRowRange Row indices of one fold; folds past numberOfFoldsRounded run to the last row.
            first = ck * (foldIndex - 1) + 1;
            if foldIndex > numberOfFoldsRounded
                rows = first : rowsNumber;
            else
                rows = first : ck * foldIndex;
            end
        end

        function storeBiasedSplit(self, foldIndex, chunk, probability, samplingRatio)
            %storeBiasedSplit Sort a chunk by ascending score and store its source/target parts.
            [~, order] = sort(probability);
            m = size(chunk, 1);
            nSource = max(0, min(m, ceil(m * samplingRatio)));
            self.source{foldIndex} = chunk(order(1:nSource), :);
            self.target{foldIndex} = chunk(order(nSource + 1 : m), :);
        end

        function createXsYsXtYt(self)
            %createXsYsXtYt Build the per-minibatch input/output matrices.
            %   For every minibatch, splits the source and target chunks
            %   into inputs (Xs, Xt) and outputs (ys, yt), stacks them into
            %   X = [Xs; Xt] and y = [ys; yt], and stores a randomly
            %   row-permuted copy of X and y together with the permutation
            %   that was applied.
            self.X = {};
            self.y = {};
            self.Xs = {};
            self.ys = {};
            self.Xt = {};
            self.yt = {};
            self.permutedX = {};
            self.permutedy = {};
            self.indexPermutation = {};

            for i = 1 : self.nMinibatches
                self.Xs{i} = self.source{i}(:, 1:end-self.nClasses);
                self.ys{i} = self.source{i}(:, self.nFeatures+1:end);
                self.Xt{i} = self.target{i}(:, 1:end-self.nClasses);
                self.yt{i} = self.target{i}(:, self.nFeatures+1:end);

                stackedX = [self.Xs{i}; self.Xt{i}];
                stackedY = [self.ys{i}; self.yt{i}];
                self.X{i} = stackedX;
                self.y{i} = stackedY;

                p = randperm(size(stackedX, 1));
                self.permutedX{i} = stackedX(p, :);
                self.permutedy{i} = stackedY(p, :);
                self.indexPermutation{i} = p;
            end
        end

        function checkDatasetEven(self)
            %checkDatasetEven Make the number of rows even.
            %   An even row count lets the data be split into equal source
            %   and target parts (when splitting by a 0.5 ratio). If the
            %   count is odd, one randomly chosen row is thrown away.
            rowsNumber = size(self.data, 1);
            if mod(rowsNumber, 2) ~= 0
                p = ceil(rand() * rowsNumber);
                self.data(p, :) = [];
            end
        end
    end
end