ACDC_KNOSYS-2021/ATL/DataManipulator.m

397 lines
17 KiB
Mathematica
Raw Normal View History

2021-10-04 18:31:00 +08:00
classdef DataManipulator < handle
    %DataManipulator Loads, normalizes and splits datasets into stream folds
    %   Reads a CSV dataset whose last column holds the class label,
    %   one-hot encodes the labels, optionally min-max normalizes the
    %   feature columns, and splits the rows into per-minibatch source and
    %   target streams for multistream classification experiments.
    %
    %   Typical use:
    %       dm = DataManipulator('data/');
    %       dm.loadCustomCSV('dataset.csv');
    %       dm.normalize();
    %       dm.splitAsSourceTargetStreams(1000, 'dallas_2', 0.5);
    %       Xs = dm.getXs(1);

    properties (Access = public)
        data = [];          % Whole dataset; after loading: feature columns followed by one-hot class columns
        nFeatures = 0;      % Number of feature columns in the dataset
        nClasses = 0;       % Number of classes (one-hot columns) in the dataset
        nFoldElements = 0;  % Number of elements per fold
        nMinibatches = 0;   % Number of minibatches (folds)
        source = {};        % Source stream, one matrix per minibatch
        target = {};        % Target stream, one matrix per minibatch
    end

    properties (Access = private)
        X = {};                 % Input data
        y = {};                 % Class data
        Xs = {};                % Source input data
        ys = {};                % Source class data
        Xt = {};                % Target input data
        yt = {};                % Target class data
        permutedX = {};         % Permuted input data
        permutedy = {};         % Permuted class data
        indexPermutation = {};  % Permutation index (to know whether a row came from source or target)
        dataFolderPath = '';    % Prefix prepended to every file name that is loaded
    end

    methods (Access = public)
        function self = DataManipulator(dataFolderPath)
            %DataManipulator Create a manipulator bound to a data folder
            %   dataFolderPath (char | string, optional)
            %       Prefix that is concatenated, as is, in front of every
            %       file name passed to the load methods (so it normally
            %       ends with a path separator). Defaults to ''.
            if nargin < 1
                dataFolderPath = '';
            end
            self.dataFolderPath = dataFolderPath;
        end

        function loadSourceCSV(self, dataset)
            %loadSourceCSV Load "<dataset>_source.csv" from the data folder
            %   dataset (char | string)
            %       Dataset base name.
            % Plain char concatenation: join() on a string array would
            % insert a space between the name and the suffix.
            self.loadCustomCSV([char(dataset), '_source.csv'])
        end

        function loadTargetCSV(self, dataset)
            %loadTargetCSV Load "<dataset>_target.csv" from the data folder
            %   dataset (char | string)
            %       Dataset base name.
            self.loadCustomCSV([char(dataset), '_target.csv'])
        end

        function loadCustomCSV(self, filename)
            %loadCustomCSV Load a numeric CSV and one-hot encode its labels
            %   The last column of the file is taken as the class label and
            %   every other column as a feature. Labels are expected to be
            %   the integers 1..K; they are expanded to K one-hot columns,
            %   so self.data ends up as [features, oneHotLabels].
            %   If the file has an odd number of rows one random row is
            %   discarded (see checkDatasetEven).
            %   filename (char | string)
            %       File name, appended to the data folder prefix.
            self.data = [];
            % Direct concatenation keeps the prefix and the name exactly as
            % given (strcat would strip trailing whitespace of char inputs).
            self.data = csvread([char(self.dataFolderPath), char(filename)]);
            if isempty(self.data)
                error('DataManipulator:emptyDataset', ...
                    'No data could be read from "%s".', char(filename));
            end

            self.checkDatasetEven();
            self.data = double(self.data);

            self.nFeatures = size(self.data, 2) - 1;
            features = self.data(:, 1:self.nFeatures);
            labels = self.data(:, end);

            if any(labels < 1 | labels ~= floor(labels))
                % Such rows match none of the 1..K columns below and would
                % silently become all-zero one-hot vectors.
                warning('DataManipulator:unexpectedLabels', ...
                    'Class labels are expected to be integers starting at 1; other labels get an all-zero one-hot row.');
            end

            self.nClasses = max(labels);
            % Compare every label against 1..K at once (implicit expansion).
            oneHot = double(labels == (1:self.nClasses));

            self.X = features;
            self.y = oneHot;
            self.data = [features oneHot];
        end

        function normalize(self)
            %normalize
            %   Min-max normalize every feature column of self.data to the
            %   interval [0, 1] and refresh the X / y views.
            %   Constant feature columns are mapped to 0.
            fprintf('Normalizing data\n');
            if self.nFeatures > 0 && ~isempty(self.data)
                cols = 1:self.nFeatures;
                features = self.data(:, cols);
                lowest = min(features, [], 1);
                span = max(features, [], 1) - lowest;
                % A constant column has zero range; dividing by 1 instead
                % yields 0 for it rather than NaN.
                span(span == 0) = 1;
                self.data(:, cols) = (features - lowest) ./ span;
            end
            self.X = self.data(:, 1 : self.nFeatures);
            self.y = self.data(:, self.nFeatures + 1 : end);
        end

        function splitAsSourceTargetStreams(self, nFoldElements, method, samplingRatio)
            %splitAsSourceTargetStreams
            %   Split the dataset to simulate the input domains of a
            %   multistream classification problem.
            %   In a multistream classification problem two different but
            %   related processes generate data continuously from a domain
            %   D (here, self.data). The first process operates in a
            %   supervised environment, i.e. all of its instances are
            %   labeled. The second process generates unlabeled data from
            %   the same domain. The two streams are called the source
            %   stream and the target stream.
            %   Labels are also stored for the target stream; they should
            %   only be used for evaluation purposes.
            %   nFoldElements (integer)
            %       Both source and target data are split into chunks
            %       containing n elements per chunk/fold. Pass zero (or the
            %       number of rows) to get a single chunk.
            %   method (string)
            %       How the rows of each chunk are distributed between
            %       source and target.
            %       Options:
            %           'none': every chunk is stored, unchanged, as both
            %               the source and the target fold
            %           'dallas_1': biased split following the paper "An
            %               adaptive framework for multistream
            %               classification" (University of Texas at Dallas)
            %           'dallas_2': biased split following the paper
            %               "FUSION - An online method for multistream
            %               classification" (University of Texas at Dallas)
            %   samplingRatio (double, optional, default 0.5)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream; the target gets the remainder.
            %       Ignored by the 'none' method.
            if nargin < 4
                samplingRatio = 0.5;
            end

            if nFoldElements == 0
                % size(.., 1) is the row count; length() would return the
                % column count for datasets wider than they are tall.
                self.nFoldElements = size(self.data, 1);
            else
                self.nFoldElements = nFoldElements;
            end

            % Drop folds left over from a previous split.
            self.source = {};
            self.target = {};

            switch method
                case 'none'
                    self.splitAsSourceTargetStreams_none(self.nFoldElements, samplingRatio);
                case 'dallas_1'
                    self.splitAsSourceTargetStreams_dallas1(self.nFoldElements, samplingRatio);
                case 'dallas_2'
                    self.splitAsSourceTargetStreams_dallas2(self.nFoldElements, samplingRatio);
                otherwise
                    error('DataManipulator:unknownSplitMethod', ...
                        'Unknown split method "%s". Use ''none'', ''dallas_1'' or ''dallas_2''.', char(method));
            end

            self.createXsYsXtYt()
        end

        function X = getX(self, idx)
            %getX Rows idx of the input data
            %   NOTE: after splitAsSourceTargetStreams the input data is
            %   stored per minibatch in a cell array, so idx then selects
            %   cells rather than samples.
            X = self.X(idx,:);
        end

        function y = getY(self, idx)
            %getY Rows idx of the class data (see the note on getX)
            y = self.y(idx,:);
        end

        function Xs = getXs(self, nMinibatch)
            %getXs
            %   Get the input matrix of one source stream minibatch.
            %   Only available after splitAsSourceTargetStreams.
            %   nMinibatch (integer)
            %       The minibatch iteration
            Xs = self.Xs{nMinibatch};
        end

        function ys = getYs(self, nMinibatch)
            %getYs
            %   Get the class matrix of one source stream minibatch.
            %   Only available after splitAsSourceTargetStreams.
            %   nMinibatch (integer)
            %       The minibatch iteration
            ys = self.ys{nMinibatch};
        end

        function Xt = getXt(self, nMinibatch)
            %getXt
            %   Get the input matrix of one target stream minibatch.
            %   Only available after splitAsSourceTargetStreams.
            %   nMinibatch (integer)
            %       The minibatch iteration
            Xt = self.Xt{nMinibatch};
        end

        function yt = getYt(self, nMinibatch)
            %getYt
            %   Get the class matrix of one target stream minibatch.
            %   Only available after splitAsSourceTargetStreams.
            %   nMinibatch (integer)
            %       The minibatch iteration
            yt = self.yt{nMinibatch};
        end
    end

    methods (Access = private)
        function splitAsSourceTargetStreams_none(self, elementsPerFold, samplingRatio) %#ok<INUSD>
            %splitAsSourceTargetStreams_none
            %   Cut the dataset into consecutive chunks without any bias.
            %   Every chunk is stored, unchanged, as both the source and
            %   the target fold of that minibatch.
            %   NOTE(review): the rows are not divided between the two
            %   streams here - confirm that identical folds are intended.
            %
            %   elementsPerFold (integer)
            %       Number of rows per chunk/fold; the last chunk holds
            %       whatever rows remain.
            %   samplingRatio (double)
            %       Unused by this method; kept for signature parity with
            %       the other split methods.
            rowsNumber = size(self.data, 1);
            self.nFoldElements = elementsPerFold;

            if rowsNumber == 0
                self.nMinibatches = 0;
                return;
            end
            if ~(elementsPerFold >= 1)
                error('DataManipulator:invalidFoldSize', ...
                    'The number of elements per fold must be at least 1.');
            end

            rowsPerChunk = ceil(elementsPerFold);
            nChunks = ceil(rowsNumber / rowsPerChunk);
            for b = 1 : nChunks
                firstRow = (b - 1) * rowsPerChunk + 1;
                % Clamp to the row count so the final (possibly shorter)
                % chunk still includes the last row of the dataset.
                lastRow = min(b * rowsPerChunk, rowsNumber);
                chunk = self.data(firstRow:lastRow, :);
                self.source{b} = chunk;
                self.target{b} = chunk;
            end
            self.nMinibatches = nChunks;
        end

        function splitAsSourceTargetStreams_dallas1(self, elementsPerFold, samplingRatio)
            %splitAsSourceTargetStreams_dallas1
            %   Biased source/target split attributed to the paper "An
            %   adaptive framework for multistream classification" (CS
            %   department, University of Texas at Dallas).
            %
            %   Within each chunk every row gets the score
            %   exp(-||x - mean(x)||^2); rows are sorted by ascending score
            %   and the first ceil(m * samplingRatio) of them become the
            %   source fold, the remaining rows the target fold.
            %
            %   elementsPerFold (integer)
            %       Requested number of rows per chunk/fold.
            %   samplingRatio (double)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream.
            rowsNumber = size(self.data, 1);
            numberOfFolds = round(rowsNumber / elementsPerFold);
            chunkSize = round(rowsNumber / numberOfFolds);
            numberOfFoldsRounded = round(rowsNumber / chunkSize);
            rowsPerRoundedFold = rowsNumber / numberOfFoldsRounded;

            % floor() keeps the chunk length integral so it can be used to
            % build row indices (same approach as the dallas_2 split).
            % NOTE(review): when the rows do not divide evenly, trailing
            % rows that do not fill a chunk may be left out.
            self.nFoldElements = min(elementsPerFold, floor(rowsPerRoundedFold));
            if rowsPerRoundedFold > elementsPerFold
                % One extra fold collects the rows left after the full ones.
                numberOfFolds = numberOfFolds + 1;
            end
            self.nMinibatches = numberOfFolds;

            ck = self.nFoldElements;
            for i = 1 : numberOfFolds
                chunk = self.data(self.chunkRows(i, ck, numberOfFoldsRounded), :);
                x = chunk(:, 1:end-self.nClasses);
                % One score per row: squared Euclidean distance to the
                % chunk mean over all features. mean(x, 1) keeps a row
                % vector even when the chunk has a single row.
                deviation = x - mean(x, 1);
                probability = exp(-sum(deviation .^ 2, 2));
                self.storeBiasedSplit(i, chunk, probability, samplingRatio);
            end
        end

        function splitAsSourceTargetStreams_dallas2(self, elementsPerFold, samplingRatio)
            %splitAsSourceTargetStreams_dallas2
            %   Biased source/target split attributed to the paper "FUSION
            %   - An online method for multistream classification"
            %   (University of Texas at Dallas).
            %
            %   Within each chunk every row gets the score
            %   exp(-||d||_2 / (2 * std(||d||_1)^2)) with d = x - mean(x);
            %   rows are sorted by ascending score and the first
            %   ceil(m * samplingRatio) of them become the source fold,
            %   the remaining rows the target fold.
            %
            %   elementsPerFold (integer)
            %       Requested number of rows per chunk/fold. The effective
            %       chunk length is forced to be even.
            %   samplingRatio (double)
            %       Value in [0.0, 1.0]: fraction of each chunk that goes
            %       to the source stream.
            rowsNumber = size(self.data, 1);
            numberOfFolds = round(rowsNumber / elementsPerFold);
            chunkSize = round(rowsNumber / numberOfFolds);
            numberOfFoldsRounded = round(rowsNumber / chunkSize);

            % Use an even chunk length so a 0.5 ratio splits it equally.
            % NOTE(review): rows beyond the last full chunk may be left
            % out of every fold - confirm this is intended.
            wholeRowsPerFold = floor(rowsNumber / numberOfFoldsRounded);
            if mod(wholeRowsPerFold, 2) == 0
                self.nFoldElements = min(elementsPerFold, wholeRowsPerFold);
            else
                self.nFoldElements = min(elementsPerFold, wholeRowsPerFold - 1);
            end
            if rowsNumber / numberOfFoldsRounded > elementsPerFold
                numberOfFolds = numberOfFolds + 1;
            end
            self.nMinibatches = numberOfFolds;

            ck = self.nFoldElements;
            for i = 1 : numberOfFolds
                chunk = self.data(self.chunkRows(i, ck, numberOfFoldsRounded), :);
                x = chunk(:, 1:end-self.nClasses);
                deviation = x - mean(x, 1);
                % Norms are taken explicitly along the feature dimension so
                % that a single-feature dataset still yields one value per
                % row instead of one norm for the whole column.
                norm_1 = vecnorm(deviation, 1, 2);
                norm_2 = vecnorm(deviation, 2, 2);
                denominator = 2 * std(norm_1) ^ 2;
                probability = exp(-norm_2 / denominator);
                self.storeBiasedSplit(i, chunk, probability, samplingRatio);
            end
        end

        function rows = chunkRows(self, foldIdx, chunkLen, nFullFolds)
            %chunkRows Row indices of self.data that belong to one fold
            %   Folds up to nFullFolds cover chunkLen consecutive rows;
            %   any later fold runs to the end of the dataset.
            firstRow = chunkLen * (foldIdx - 1) + 1;
            if foldIdx > nFullFolds
                rows = firstRow : size(self.data, 1);
            else
                rows = firstRow : chunkLen * foldIdx;
            end
        end

        function storeBiasedSplit(self, foldIdx, chunk, probability, samplingRatio)
            %storeBiasedSplit Store one fold split by ascending row score
            %   probability holds one score per row of chunk. The
            %   ceil(m * samplingRatio) lowest-scored rows are stored as
            %   the source fold and the remaining rows as the target fold.
            [~, order] = sort(probability);
            m = size(chunk, 1);
            nSource = ceil(m * samplingRatio);
            self.source{foldIdx} = chunk(order(1:nSource), :);
            self.target{foldIdx} = chunk(order(nSource+1:m), :);
        end

        function createXsYsXtYt(self)
            %createXsYsXtYt
            %   Derive, for every minibatch, the input and class matrices
            %   of the source fold, of the target fold and of both stacked
            %   together (source rows first). A randomly permuted copy of
            %   the stacked matrices is stored along with the permutation.
            self.X = {};
            self.y = {};
            self.Xs = {};
            self.ys = {};
            self.Xt = {};
            self.yt = {};
            self.permutedX = {};
            self.permutedy = {};
            % Reset as well, so no permutation of an earlier split lingers.
            self.indexPermutation = {};

            for i = 1 : self.nMinibatches
                self.Xs{i} = self.source{i}(:,1:end-self.nClasses);
                self.ys{i} = self.source{i}(:,self.nFeatures+1:end);
                self.Xt{i} = self.target{i}(:,1:end-self.nClasses);
                self.yt{i} = self.target{i}(:,self.nFeatures+1:end);

                stackedX = [self.Xs{i}; self.Xt{i}];
                stackedY = [self.ys{i}; self.yt{i}];
                self.X{i} = stackedX;
                self.y{i} = stackedY;

                p = randperm(size(stackedX, 1));
                self.permutedX{i} = stackedX(p,:);
                self.permutedy{i} = stackedY(p,:);
                self.indexPermutation{i} = p;
            end
        end

        function checkDatasetEven(self)
            %checkDatasetEven
            %   Make the number of rows of the dataset even, so it can be
            %   split into equally sized source and target streams when a
            %   0.5 ratio is used. If the row count is odd, one randomly
            %   chosen row is thrown away.
            rowsNumber = size(self.data, 1);
            if mod(rowsNumber, 2) ~= 0
                p = ceil(rand() * rowsNumber);
                self.data(p, :) = [];
            end
        end
    end
end