397 lines
17 KiB
Mathematica
397 lines
17 KiB
Mathematica
|
classdef DataManipulator < handle
|
||
|
%DataManipulator Manipulates and prepares data
%   It manipulates and prepares the data used to train and test our
%   research models.
%   It is already prepared to load and interact with most of the data
%   used in our lab.
|
||
|
properties (Access = public)
    % Whole dataset
    data = [];
    % Number of features in the dataset
    nFeatures = 0;
    % Number of classes in the dataset
    nClasses = 0;

    % Number of elements per fold
    nFoldElements = 0;
    % Number of minibatches
    nMinibatches = 0;

    % Source data (one cell per minibatch)
    source = {};
    % Target data (one cell per minibatch)
    target = {};
end
|
||
|
properties (Access = private)
    % Input data
    X = {};
    % Class data
    y = {};
    % Source input data
    Xs = {};
    % Source class data
    ys = {};
    % Target input data
    Xt = {};
    % Target class data
    yt = {};

    % Permuted input data
    permutedX = {};
    % Permuted class data
    permutedy = {};

    % Permutation index of each minibatch (kept in order to know whether
    % a permuted row came from source or target)
    indexPermutation = {};

    % Folder path used as prefix for the CSV file names being loaded
    dataFolderPath = '';
end
|
||
|
|
||
|
methods (Access = public)
|
||
|
function self = DataManipulator(dataFolderPath)
    %DataManipulator Create a manipulator bound to a data folder.
    %   dataFolderPath (char or string, optional)
    %       Path that is combined with the file names given to the CSV
    %       loading methods. Defaults to '' when omitted, so the object
    %       can also be constructed without arguments.
    if nargin < 1
        dataFolderPath = '';
    end
    self.dataFolderPath = dataFolderPath;
end
|
||
|
|
||
|
function loadSourceCSV(self, dataset)
    %loadSourceCSV Load the "<dataset>_source.csv" file.
    %   dataset (char or string)
    %       Dataset name. The suffix '_source.csv' is appended to it and
    %       the resulting file name is handed to loadCustomCSV.

    % char() guarantees a plain character concatenation. Concatenating a
    % string scalar with a char vector produces a 1x2 string array, which
    % join() would merge with a space between the two parts.
    self.loadCustomCSV([char(dataset), '_source.csv']);
end
|
||
|
|
||
|
function loadTargetCSV(self, dataset)
    %loadTargetCSV Load the "<dataset>_target.csv" file.
    %   dataset (char or string)
    %       Dataset name. The suffix '_target.csv' is appended to it and
    %       the resulting file name is handed to loadCustomCSV.

    % char() guarantees a plain character concatenation. Concatenating a
    % string scalar with a char vector produces a 1x2 string array, which
    % join() would merge with a space between the two parts.
    self.loadCustomCSV([char(dataset), '_target.csv']);
end
|
||
|
|
||
|
function loadCustomCSV(self, filename)
    %loadCustomCSV Load a CSV dataset and one-hot encode its labels.
    %   The file is read as a numeric matrix with one sample per row. The
    %   last column is taken as the class label and every other column as
    %   a feature. Labels are encoded against the classes 1..max(label),
    %   so they are expected to be positive integers.
    %
    %   After the call:
    %     nFeatures - number of feature columns
    %     nClasses  - largest label value found
    %     X         - feature matrix
    %     y         - one-hot label matrix (rows x nClasses)
    %     data      - [X y]
    %
    %   filename (char or string)
    %       File name, resolved relative to the folder path given to the
    %       constructor.

    % Drop any previously loaded dataset before reading, so a failed read
    % does not leave the old data in place.
    self.data = [];
    % fullfile inserts the file separator when the folder path does not
    % already end with one.
    self.data = csvread(fullfile(self.dataFolderPath, filename));
    self.checkDatasetEven();
    self.data = double(self.data);

    self.nFeatures = size(self.data, 2) - 1;
    self.X = self.data(:, 1:self.nFeatures);
    labels = self.data(:, end);
    self.nClasses = max(labels);

    % Rows whose label is not one of 1..nClasses end up with an all-zero
    % one-hot vector; make that visible instead of failing silently.
    if any(labels < 1 | labels ~= floor(labels))
        warning('DataManipulator:invalidLabels', ...
            'Labels are expected to be positive integers; other values get an all-zero one-hot row.');
    end

    % Compare the label column against the row vector 1..nClasses: entry
    % (r, c) is 1 exactly when sample r belongs to class c.
    self.y = double(labels == (1:self.nClasses));
    self.data = [self.X self.y];
end
|
||
|
|
||
|
function normalize(self)
    %normalize
    %   Normalize every feature between 0 and 1 (min-max scaling), then
    %   refresh X and y from the normalized dataset.
    %   Only the first nFeatures columns of data are rescaled; the label
    %   columns are left untouched.
    fprintf('Normalizing data\n');
    for col = 1 : self.nFeatures
        column = self.data(:, col);
        lowest = min(column);
        % Divide by the value range (max - min), not by max alone,
        % otherwise the result does not span [0, 1].
        span = max(column) - lowest;
        if span == 0
            % Constant feature: every value maps to 0 and the division
            % by zero is avoided.
            span = 1;
        end
        self.data(:, col) = (column - lowest) / span;
    end

    self.X = self.data(:, 1 : self.nFeatures);
    self.y = self.data(:, self.nFeatures + 1 : end);
end
|
||
|
|
||
|
function splitAsSourceTargetStreams(self, nFoldElements, method, samplingRatio)
    %splitAsSourceTargetStreams
    %   Split the dataset to simulate the input domains of a Multistream
    %   classification problem.
    %   In a Multistream classification problem, we consider that two
    %   different but related processes generate data continuously from a
    %   domain D (in this case, self.data). The first process operates in
    %   a supervised environment, i.e., all the data instances generated
    %   by the first process are labeled. On the contrary, the second
    %   process generates unlabeled data from the same domain. The streams
    %   of data generated by these processes are called the source stream
    %   and the target stream.
    %   Labels are also kept for the target stream; the user should only
    %   use them for evaluation purposes.
    %
    %   nFoldElements (integer)
    %       Both source and target data will be split into chunks of data
    %       containing n elements per chunk/fold.
    %       If you only want one chunk, pass zero or size(data,1) as
    %       argument.
    %   method (string)
    %       Method used to distribute the data into source and target.
    %       Usually, Multistream Classification problems distribute the
    %       data using some bias probability.
    %       Options:
    %           'none': every chunk is stored as both the source and the
    %           target fold.
    %           'dallas_1': Source and Target streams will be split using
    %           the bias described by the paper "An adaptive framework
    %           for multistream classification" from the CS department of
    %           the University of Texas at Dallas.
    %           'dallas_2': Source and Target streams will be split using
    %           the bias described by the paper "FUSION - An online
    %           method for multistream classification" from the
    %           University of Texas at Dallas.
    %   samplingRatio (double, optional)
    %       Value in the interval [0.0,1.0] which describes the fraction
    %       of each chunk that goes to the Source Stream. Target receives
    %       the remaining 1 - samplingRatio. Defaults to 0.5.
    if nargin < 4
        samplingRatio = 0.5;
    end

    if nFoldElements == 0
        % size(...,1) is the row count; length() would return the number
        % of columns for a dataset that is wider than it is tall.
        self.nFoldElements = size(self.data, 1);
    else
        self.nFoldElements = nFoldElements;
    end

    % Forget the folds of any earlier split so no stale cells survive
    % when the new split yields fewer folds.
    self.source = {};
    self.target = {};

    switch method
        case 'none'
            self.splitAsSourceTargetStreams_none(self.nFoldElements, samplingRatio);
        case 'dallas_1'
            self.splitAsSourceTargetStreams_dallas1(self.nFoldElements, samplingRatio);
        case 'dallas_2'
            self.splitAsSourceTargetStreams_dallas2(self.nFoldElements, samplingRatio);
        otherwise
            % Fail loudly instead of building streams from whatever fold
            % count was left over from an earlier call.
            error('DataManipulator:unknownMethod', ...
                'Unknown split method "%s". Use ''none'', ''dallas_1'' or ''dallas_2''.', method);
    end

    self.createXsYsXtYt()
end
|
||
|
|
||
|
function X = getX(self, idx)
    %getX Rows of the stored input data selected by idx.
    %   idx
    %       Row subscript applied to the stored input data.
    inputs = self.X;
    X = inputs(idx, :);
end
|
||
|
|
||
|
function y = getY(self, idx)
    %getY Rows of the stored class data selected by idx.
    %   idx
    %       Row subscript applied to the stored class data.
    classes = self.y;
    y = classes(idx, :);
end
|
||
|
|
||
|
function Xs = getXs(self, nMinibatch)
    %getXs
    %   Return the input matrix of one minibatch of the source stream.
    %   The source stream only exists after the dataset has been split
    %   into source and target data streams.
    %
    %   nMinibatch (integer)
    %       The minibatch iteration
    sourceInputs = self.Xs;
    Xs = sourceInputs{nMinibatch};
end
|
||
|
function ys = getYs(self, nMinibatch)
    %getYs
    %   Return the class matrix of one minibatch of the source stream.
    %   The source stream only exists after the dataset has been split
    %   into source and target data streams.
    %
    %   nMinibatch (integer)
    %       The minibatch iteration
    sourceClasses = self.ys;
    ys = sourceClasses{nMinibatch};
end
|
||
|
function Xt = getXt(self, nMinibatch)
    %getXt
    %   Return the input matrix of one minibatch of the target stream.
    %   The target stream only exists after the dataset has been split
    %   into source and target data streams.
    %
    %   nMinibatch (integer)
    %       The minibatch iteration
    targetInputs = self.Xt;
    Xt = targetInputs{nMinibatch};
end
|
||
|
function yt = getYt(self, nMinibatch)
    %getYt
    %   Return the class matrix of one minibatch of the target stream.
    %   The target stream only exists after the dataset has been split
    %   into source and target data streams.
    %
    %   nMinibatch (integer)
    %       The minibatch iteration
    targetClasses = self.yt;
    yt = targetClasses{nMinibatch};
end
|
||
|
end
|
||
|
methods (Access = private)
|
||
|
function splitAsSourceTargetStreams_none(self, elementsPerFold, samplingRatio)
    %splitAsSourceTargetStreams_none
    %   Cut the dataset into consecutive folds without applying any bias.
    %
    %   Every fold is stored unchanged as BOTH the source and the target
    %   fold of the same index; samplingRatio is accepted for signature
    %   compatibility only and is not used.
    %   NOTE(review): the public help text describes this mode as a split
    %   "on half" - confirm that sharing the whole chunk is intended.
    %
    %   elementsPerFold (integer)
    %       Number of rows per fold. The last fold holds the remaining
    %       rows and may be shorter.
    %   samplingRatio (double)
    %       Unused by this method.
    rowsNumber = size(self.data, 1);

    self.nFoldElements = elementsPerFold;
    % Discard folds left over from a previous split.
    self.source = {};
    self.target = {};

    if rowsNumber == 0
        self.nMinibatches = 0;
        return;
    end
    if ~(elementsPerFold >= 1)
        % A non-positive fold size can never consume the dataset.
        error('DataManipulator:invalidFoldSize', ...
            'elementsPerFold must be at least 1.');
    end

    % A fractional fold size holds ceil(elementsPerFold) rows.
    foldSize = ceil(elementsPerFold);
    numberOfFolds = ceil(rowsNumber / foldSize);
    for b = 1 : numberOfFolds
        firstRow = (b - 1) * foldSize + 1;
        % min() clamps the final fold so the very last row is included
        % and no index runs past the end of the dataset.
        lastRow = min(b * foldSize, rowsNumber);
        chunk = self.data(firstRow:lastRow, :);
        self.source{b} = chunk;
        self.target{b} = chunk;
    end

    self.nMinibatches = numberOfFolds;
end
|
||
|
function splitAsSourceTargetStreams_dallas1(self, elementsPerFold, samplingRatio)
    %splitAsSourceTargetStreams_dallas1
    %   Split the dataset to simulate the input domains of a Multistream
    %   classification problem.
    %
    %   Each fold is divided into source and target using the bias
    %   described by the paper "An adaptive framework for multistream
    %   classification" from the CS department of the University of Texas
    %   at Dallas: every sample gets the score
    %   exp(-||x - mean(x)||^2) computed inside its fold, samples are
    %   sorted by ascending score, and the first ceil(m * samplingRatio)
    %   of the m samples go to source while the rest go to target.
    %
    %   elementsPerFold (integer)
    %       Requested number of rows per fold. The fold size actually
    %       used is stored in nFoldElements. Rows beyond the last fold
    %       boundary are only used when an extra overflow fold is added.
    %   samplingRatio (double)
    %       Value in the interval [0.0,1.0] which describes the fraction
    %       of each fold that goes to the Source Stream. Target receives
    %       the remaining 1 - samplingRatio.

    % Row count; length() would return the column count for wide data.
    rowsNumber = size(self.data, 1);

    % Discard folds left over from a previous split.
    self.source = {};
    self.target = {};
    if rowsNumber == 0
        self.nMinibatches = 0;
        return;
    end

    numberOfFolds = round(rowsNumber / elementsPerFold);
    chunkSize = round(rowsNumber / numberOfFolds);
    numberOfFoldsRounded = round(rowsNumber / chunkSize);
    rowsPerRoundedFold = rowsNumber / numberOfFoldsRounded;
    % floor() keeps the fold size an integer, so the row ranges below are
    % valid subscripts even when the rows do not divide evenly.
    self.nFoldElements = min(elementsPerFold, floor(rowsPerRoundedFold));

    if rowsPerRoundedFold > elementsPerFold
        % The regular folds do not cover the dataset: add one overflow
        % fold that takes every remaining row.
        numberOfFolds = numberOfFolds + 1;
    end
    self.nMinibatches = numberOfFolds;
    ck = self.nFoldElements;

    for i = 1 : numberOfFolds
        firstRow = ck * (i - 1) + 1;
        if i > numberOfFoldsRounded
            chunk = self.data(firstRow:end, :);
        else
            chunk = self.data(firstRow:ck * i, :);
        end
        x = chunk(:, 1:end - self.nClasses);
        m = size(chunk, 1);

        % Explicit dimension: mean(x) would collapse a single-row chunk
        % into one scalar instead of a per-feature mean.
        x_mean = mean(x, 1);
        % One score per sample (squared Euclidean distance to the fold
        % mean). A per-feature matrix here would make sort() work column
        % by column and rank the samples by their first feature only.
        probability = exp(-sum((x - x_mean) .^ 2, 2));
        [~, idx] = sort(probability);

        nSource = ceil(m * samplingRatio);
        self.source{i} = chunk(idx(1:nSource), :);
        % Stop at the row count m; length(chunk) would be the column
        % count whenever the chunk is wider than it is tall.
        self.target{i} = chunk(idx(nSource + 1:m), :);
    end
end
|
||
|
function splitAsSourceTargetStreams_dallas2(self, elementsPerFold, samplingRatio)
    %splitAsSourceTargetStreams_dallas2
    %   Split the dataset to simulate the input domains of a Multistream
    %   classification problem.
    %
    %   Each fold is divided into source and target using the bias
    %   described by the paper "FUSION - An online method for multistream
    %   classification" from the University of Texas at Dallas. Inside a
    %   fold every sample gets the score
    %       exp(-||x - mean(x)||_2 / (2 * std(||x - mean(x)||_1)^2)),
    %   samples are sorted by ascending score, and the first
    %   ceil(m * samplingRatio) of the m samples go to source while the
    %   rest go to target.
    %
    %   elementsPerFold (integer)
    %       Requested number of rows per fold. The fold size actually
    %       used is forced to an even number and stored in nFoldElements.
    %   samplingRatio (double)
    %       Value in the interval [0.0,1.0] which describes the fraction
    %       of each fold that goes to the Source Stream. Target receives
    %       the remaining 1 - samplingRatio.

    % Row count; length() would return the column count for wide data.
    rowsNumber = size(self.data, 1);

    % Discard folds left over from a previous split.
    self.source = {};
    self.target = {};
    if rowsNumber == 0
        self.nMinibatches = 0;
        return;
    end

    numberOfFolds = round(rowsNumber / elementsPerFold);
    chunkSize = round(rowsNumber / numberOfFolds);
    numberOfFoldsRounded = round(rowsNumber / chunkSize);

    % Largest whole fold size, lowered by one when odd so that it is even.
    wholeRowsPerFold = floor(rowsNumber / numberOfFoldsRounded);
    if mod(wholeRowsPerFold, 2) ~= 0
        wholeRowsPerFold = wholeRowsPerFold - 1;
    end
    self.nFoldElements = min(elementsPerFold, wholeRowsPerFold);

    if rowsNumber / numberOfFoldsRounded > elementsPerFold
        % The regular folds do not cover the dataset: add one overflow
        % fold that takes every remaining row.
        numberOfFolds = numberOfFolds + 1;
    end
    self.nMinibatches = numberOfFolds;
    ck = self.nFoldElements;

    for i = 1 : numberOfFolds
        firstRow = ck * (i - 1) + 1;
        if i > numberOfFoldsRounded
            chunk = self.data(firstRow:end, :);
        else
            chunk = self.data(firstRow:ck * i, :);
        end
        x = chunk(:, 1:end - self.nClasses);
        m = size(chunk, 1);

        % Explicit dimension: mean(x) would collapse a single-row chunk
        % into one scalar instead of a per-feature mean.
        centered = x - mean(x, 1);
        % Row-wise norms with the dimension stated explicitly; the
        % transpose form returns a single scalar when there is only one
        % feature column, because the transposed input is then a vector.
        norm_1 = vecnorm(centered, 1, 2);
        norm_2 = vecnorm(centered, 2, 2);
        denominator = 2 * std(norm_1) ^ 2;
        probability = exp(-norm_2 / denominator);
        [~, idx] = sort(probability);

        nSource = ceil(m * samplingRatio);
        self.source{i} = chunk(idx(1:nSource), :);
        self.target{i} = chunk(idx(nSource + 1:m), :);
    end
end
|
||
|
|
||
|
function createXsYsXtYt(self)
    %createXsYsXtYt
    %   Split the data-stream folds into sets of input, output, input
    %   from source, output from source, input from target and output
    %   from target, one cell per minibatch.
    %   It also creates a row-permuted version of each minibatch and
    %   stores the permutation that was applied in indexPermutation.
    %   Within a minibatch the source rows are stacked before the target
    %   rows, so the stored permutation tells which stream a permuted row
    %   came from.
    self.X = {};
    self.y = {};
    self.Xs = {};
    self.ys = {};
    self.Xt = {};
    self.yt = {};
    self.permutedX = {};
    self.permutedy = {};
    % Reset together with the permuted data; otherwise permutations from
    % an earlier split with more minibatches would linger.
    self.indexPermutation = {};

    for i = 1 : self.nMinibatches
        sourceFold = self.source{i};
        targetFold = self.target{i};

        self.Xs{i} = sourceFold(:, 1:end - self.nClasses);
        self.ys{i} = sourceFold(:, self.nFeatures + 1:end);
        self.Xt{i} = targetFold(:, 1:end - self.nClasses);
        self.yt{i} = targetFold(:, self.nFeatures + 1:end);

        % Source rows first, target rows after.
        inputs = [self.Xs{i}; self.Xt{i}];
        classes = [self.ys{i}; self.yt{i}];
        self.X{i} = inputs;
        self.y{i} = classes;

        % The same permutation is applied to inputs and classes so the
        % rows stay paired.
        p = randperm(size(inputs, 1));
        self.permutedX{i} = inputs(p, :);
        self.permutedy{i} = classes(p, :);
        self.indexPermutation{i} = p;
    end
end
|
||
|
|
||
|
function checkDatasetEven(self)
    %checkDatasetEven
    %   Check if the number of rows in the whole dataset is even, so it
    %   can be split into an equal number of elements for source and
    %   target (when splitting by a 0.5 ratio).
    %   If the number is odd, one randomly chosen row is thrown away.

    % Count rows explicitly: length() returns the largest dimension, so
    % for a dataset with more columns than rows it would test the column
    % count and could pick a row index that does not exist.
    rowsNumber = size(self.data, 1);
    if mod(rowsNumber, 2) ~= 0
        p = ceil(rand() * rowsNumber);
        self.data = [self.data(1:p-1, :); self.data(p+1:end, :)];
    end
end
|
||
|
end
|
||
|
end
|
||
|
|