Add files via upload

This commit is contained in:
Marcus Vinicius de Carvalho 2021-10-04 18:31:00 +08:00 committed by GitHub
parent 128e7ddda7
commit 7e1832d7cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
49 changed files with 5060 additions and 0 deletions

241
ATL/AGMM.m Normal file

@@ -0,0 +1,241 @@
classdef AGMM < handle
properties (Access = public)
gmmArray = [];
nSamplesFeed = 0;
rho = 0.1;
nFeatures;
end
methods (Access = public)
function run(self, x, bias2)
self.nSamplesFeed = self.nSamplesFeed + 1;
if size(self.gmmArray, 1) == 0
self.gmmArray = [self.gmmArray; GMM(x)];
self.nFeatures = size(x, 2);
else
self.computeInference(x);
[~, gmmWinnerIdx] = max(self.updateWeights());
if self.M() > 1
self.computeOverlapsDegree(gmmWinnerIdx, 3, 3);
end
denominator = 1.25 * exp(-bias2) + 0.75 * self.nFeatures;
numerator = 4 - 2 * exp( -self.nFeatures / 2);
threshold = exp(- denominator / numerator);
if self.gmmArray(gmmWinnerIdx).inference < threshold ...
&& self.gmmArray(gmmWinnerIdx).hyperVolume > self.rho * (self.computeSumHyperVolume() - self.gmmArray(gmmWinnerIdx).hyperVolume)...
&& self.nSamplesFeed > 10
% Create a new cluster
self.createCluster(x);
self.gmmArray(end).var = (x - self.gmmArray(gmmWinnerIdx).center) .^ 2;
else
% Update the winning cluster
self.updateCluster(x, self.gmmArray(gmmWinnerIdx));
end
end
end
function createCluster(self, x)
self.gmmArray = [self.gmmArray; GMM(x)];
weightSum = 0;
for i = 1 : size(self.gmmArray, 1)
weightSum = weightSum + self.gmmArray(i).weight;
end
for i = 1 : size(self.gmmArray, 1)
self.gmmArray(i).weight = self.gmmArray(i).weight/weightSum;
end
end
function updateCluster(~, x, gmm)
gmm.winCounter = gmm.winCounter + 1;
gmm.center = gmm.center + (x - gmm.center) / gmm.winCounter;
gmm.var = gmm.var + ((x - gmm.center) .^ 2 - gmm.var) / gmm.winCounter;
end
function deleteCluster(self)
accu_e = zeros(1, size(self.gmmArray, 1));
for i = 1 : size(self.gmmArray, 1)
accu_e(i) = self.gmmArray(i).inferenceSum / self.gmmArray(i).surviveCounter;
end
accu_e(isnan(accu_e)) = [];
deleteList = find(accu_e <= mean(accu_e) - 0.5 * std(accu_e));
if ~isempty(deleteList)
self.gmmArray(deleteList) = [];
accu_e(deleteList) = [];
end
sumWeight = 0;
for i = 1 : size(self.gmmArray, 1)
sumWeight = sumWeight + self.gmmArray(i).weight;
end
if sumWeight == 0
[~, maxIdx] = max(accu_e);
self.gmmArray(maxIdx).weight = self.gmmArray(maxIdx).weight + 1;
end
sumWeight = 0;
for i = 1 : size(self.gmmArray, 1)
sumWeight = sumWeight + self.gmmArray(i).weight;
end
for i = 1 : size(self.gmmArray, 1)
self.gmmArray(i).weight = self.gmmArray(i).weight / sumWeight;
end
end
function hyperVolume = computeSumHyperVolume(self)
hyperVolume = 0;
for i = 1 : size(self.gmmArray, 1)
hyperVolume = hyperVolume + self.gmmArray(i).hyperVolume;
end
end
function computeInference(self, x, y)
for i = 1 : size(self.gmmArray, 1)
gmm = self.gmmArray(i);
if nargin == 3
gmm.computeInference(x, y);
else
gmm.computeInference(x);
end
end
end
function weights = updateWeights(self)
denumerator = zeros(1, size(self.gmmArray, 1));
probX_J = zeros(1, size(self.gmmArray, 1));
probJ = zeros(1, size(self.gmmArray, 1));
probX_JprobJ = zeros(1, size(self.gmmArray, 1));
weights = zeros(1, size(self.gmmArray, 1));
sumWinCounter = 0;
maxInference = 0;
maxInferenceIdx = 1;
for i = 1 : size(self.gmmArray, 1)
sumWinCounter = sumWinCounter + self.gmmArray(i).winCounter;
if self.gmmArray(i).inference > maxInference
maxInference = self.gmmArray(i).inference;
maxInferenceIdx = i;
end
end
for i = 1 : size(self.gmmArray, 1)
self.gmmArray(i).inferenceSum = self.gmmArray(i).inferenceSum + self.gmmArray(i).inference;
self.gmmArray(i).surviveCounter = self.gmmArray(i).surviveCounter + 1;
denumerator(i) = sqrt(2 * pi * self.gmmArray(i).hyperVolume);
probX_J(i) = denumerator(i) .* self.gmmArray(i).inference;
probJ(i) = self.gmmArray(i).winCounter / sumWinCounter;
probX_JprobJ(i) = probX_J(i) * probJ(i);
end
if sum(probX_JprobJ) == 0
probX_JprobJ(maxInferenceIdx) = probX_JprobJ(maxInferenceIdx) + 1;
end
for i = 1 : size(self.gmmArray, 1)
self.gmmArray(i).weight = probX_JprobJ(i) / sum(probX_JprobJ);
weights(i) = self.gmmArray(i).weight;
end
end
function computeOverlapsDegree(self, gmmWinnerIdx, maximumLimit, minimumLimit)
if nargin == 2
maximumLimit = 3;
minimumLimit = maximumLimit;
elseif nargin == 3
minimumLimit = maximumLimit;
end
maximumLimit = abs(maximumLimit);
minimumLimit = abs(minimumLimit);
nGMM = size(self.gmmArray, 1);
overlap_coefficient = 1/(nGMM-1);
sigmaMaximumWinner = maximumLimit * sqrt(self.gmmArray(gmmWinnerIdx).var);
sigmaMinimumWinner = minimumLimit * sqrt(self.gmmArray(gmmWinnerIdx).var);
if maximumLimit == minimumLimit
miu_plus_sigma_winner = self.gmmArray(gmmWinnerIdx).center + sigmaMaximumWinner;
miu_mins_sigma_winner = self.gmmArray(gmmWinnerIdx).center - sigmaMinimumWinner;
else
miu_plus_sigma_winner = sigmaMinimumWinner + sigmaMaximumWinner;
miu_mins_sigma_winner = -sigmaMinimumWinner -sigmaMaximumWinner;
end
miu_plus_sigma = zeros(nGMM, self.nFeatures);
miu_mins_sigma = zeros(nGMM, self.nFeatures);
overlap_mins_mins = zeros(1, nGMM);
overlap_mins_plus = zeros(1, nGMM);
overlap_plus_mins = zeros(1, nGMM);
overlap_plus_plus = zeros(1, nGMM);
overlap_score = zeros(1, nGMM);
for i = 1 : nGMM
sigmaMaximum = maximumLimit * sqrt(self.gmmArray(i).var);
sigmaMinimum = minimumLimit * sqrt(self.gmmArray(i).var);
if maximumLimit == minimumLimit
miu_plus_sigma(i, :) = self.gmmArray(i).center + sigmaMaximum;
miu_mins_sigma(i, :) = self.gmmArray(i).center - sigmaMaximum;
else
miu_plus_sigma(i, :) = sigmaMinimum + sigmaMaximum;
miu_mins_sigma(i, :) = -sigmaMinimum - sigmaMaximum;
end
overlap_mins_mins(i) = mean(miu_mins_sigma(i,:) - miu_mins_sigma_winner);
overlap_mins_plus(i) = mean(miu_plus_sigma(i,:) - miu_mins_sigma_winner);
overlap_plus_mins(i) = mean(miu_mins_sigma(i,:) - miu_plus_sigma_winner);
overlap_plus_plus(i) = mean(miu_plus_sigma(i,:) - miu_plus_sigma_winner);
condition1 = overlap_mins_mins(i) >= 0 ...
&& overlap_mins_plus(i) >= 0 ...
&& overlap_plus_mins(i) <= 0 ...
&& overlap_plus_plus(i) <= 0;
condition2 = overlap_mins_mins(i) <= 0 ...
&& overlap_mins_plus(i) >= 0 ...
&& overlap_plus_mins(i) <= 0 ...
&& overlap_plus_plus(i) >= 0;
condition3 = overlap_mins_mins(i) > 0 ...
&& overlap_mins_plus(i) > 0 ...
&& overlap_plus_mins(i) < 0 ...
&& overlap_plus_plus(i) > 0;
condition4 = overlap_mins_mins(i) < 0 ...
&& overlap_mins_plus(i) > 0 ...
&& overlap_plus_mins(i) < 0 ...
&& overlap_plus_plus(i) < 0;
if condition1 || condition2
% full overlap, the cluster is inside the winning cluster
% the score is full score 1/(nGMM-1)
overlap_score(i) = overlap_coefficient;
elseif condition3 || condition4
% partial overlap, the score is the full score multiplied
% by the overlap degree
reward = norm(self.gmmArray(i).center - self.gmmArray(gmmWinnerIdx).center)...
/ norm(self.gmmArray(i).center + self.gmmArray(gmmWinnerIdx).center)...
+ norm(sqrt(self.gmmArray(i).var) - sqrt(self.gmmArray(gmmWinnerIdx).var))...
/ norm(sqrt(self.gmmArray(i).var) + sqrt(self.gmmArray(gmmWinnerIdx).var));
overlap_score(i) = overlap_coefficient * reward;
end
end
overlap_score(gmmWinnerIdx) = []; % take out the winner score from the array
self.rho = sum(overlap_score);
self.rho = min(self.rho, 1);
self.rho = max(self.rho, 0.1); % Do not let rho = zero
end
function M = computeNumberOfGmms(self)
M = size(self.gmmArray, 1);
end
function M = M(self)
M = self.computeNumberOfGmms();
end
end
end
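A minimal sketch of driving the AGMM sample by sample (it assumes GMM.m from this commit is on the path; the random data and the bias2 value are illustrative, not values used by ATL.m):

% Illustrative sketch: feed an AGMM one sample at a time and inspect it.
agmm = AGMM();
X = rand(100, 4);    % hypothetical stream of 100 samples with 4 features
bias2 = 0.5;         % hypothetical network bias^2 that drives the cluster-creation threshold
for n = 1 : size(X, 1)
    agmm.run(X(n, :), bias2);
end
fprintf('Clusters: %d, rho: %f\n', agmm.M(), agmm.rho);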

585
ATL/ATL.m Normal file

@@ -0,0 +1,585 @@
filename;
dmS = DataManipulator('');
dmS.loadSourceCSV(filename);
dmT = DataManipulator('');
dmT.loadTargetCSV(filename);
crSource = [];
crTarget = [];
trainTime = [];
testTime = [];
KlLossEvolution = [];
classificationLoss = [];
nodeEvolution = [];
discriminativeLoss = [];
generativeLossTarget = [];
agmmTargetGenSize = [];
agmmSourceDiscSize = [];
nodeEvolutionTarget = [];
nodeEvolutionSource = [];
gmmTargetBatch = [];
gmmSourceBatch = [];
nn = NeuralNetwork([dmS.nFeatures 1 dmS.nClasses]);
ae = DenoisingAutoEncoder([nn.layers(1) nn.layers(2) nn.layers(1)]);
% I am building the greedyLayerBias
x = dmS.getX(1);
ae.greddyLayerWiseTrain(x(1, :), 1, 0.1);
% I am building the greedyLayerBias
agmmSourceDisc = AGMM();
agmmTargetGen = AGMM();
sourceSize = size(dmS.data, 1);
targetSize = size(dmT.data, 1);
sourceIndex = 0;
targetIndex = 0;
i = 0;
originalLearningRate = ae.learningRate;
epochs = 1;
while (sourceIndex + targetIndex) < (sourceSize + targetSize)
i = i + 1;
Xs = [];
ys = [];
Xt = [];
yt = [];
batchCount = 0;
while batchCount < 1000 && (sourceIndex + targetIndex) <= (sourceSize + targetSize)
ratio = (sourceSize - sourceIndex) / (sourceSize + targetSize - sourceIndex - targetIndex);
if (rand(1) <= ratio && sourceIndex < sourceSize) || (targetIndex >= targetSize && sourceIndex < sourceSize)
sourceIndex = sourceIndex + 1;
Xs = [Xs; dmS.getX(sourceIndex)];
ys = [ys; dmS.getY(sourceIndex)];
elseif targetIndex < targetSize
targetIndex = targetIndex + 1;
Xt = [Xt; dmT.getX(targetIndex)];
yt = [yt; dmT.getY(targetIndex)];
end
batchCount = batchCount + 1;
end
%% workaround
if size(Xs, 1) == 0
Xs = [Xs, dmS.getX(sourceSize)];
ys = [ys, dmS.getY(sourceSize)];
end
if size(Xt, 1) == 0
Xt = [Xt, dmT.getX(targetSize)];
yt = [yt, dmT.getY(targetSize)];
end
%% Evaluation ~ Test Target
tic
nn.test(Xt, yt);
crTarget(end + 1) = nn.classificationRate;
classificationLoss(end + 1) = nn.lossValue;
testTime(end + 1) = toc;
nn.test(Xs(max(Xs, [], 2) ~= 0, :), ys(max(Xs, [], 2) ~= 0, :));
crSource(end + 1) = nn.classificationRate;
discriminativeLoss(end + 1) = nn.lossValue;
ae.test(Xt);
generativeLossTarget(end + 1) = ae.lossValue;
if i > 1
nodeEvolutionTarget(end + 1) = nodeEvolutionTarget(i - 1);
nodeEvolutionSource(end + 1) = nodeEvolutionSource(i - 1);
else
nodeEvolutionTarget(end + 1) = 0;
nodeEvolutionSource(end + 1) = 0;
end
tic
for epoch = 1 : epochs
%% Discriminative phase on Source
nn.setAgmm(agmmSourceDisc);
for j = 1 : size(Xs, 1)
x = Xs(j, :);
y = ys(j, :);
if max(y) == 0
continue
end
lastHiddenLayerNo = numel(nn.layers) - 1;
nn.forwardpass(x);
if epoch == 1
agmmSourceDiscSize(end + 1) = nn.runAgmm(x, y).M();
nn.widthAdaptationStepwise(lastHiddenLayerNo, y);
else
nn.nSamplesFeed = nn.nSamplesFeed - 1;
nn.nSamplesLayer(lastHiddenLayerNo) = nn.nSamplesLayer(lastHiddenLayerNo) - 1;
nn.widthAdaptationStepwise(lastHiddenLayerNo, y);
nn.BIAS2{lastHiddenLayerNo}(end) = [];
nn.VAR{lastHiddenLayerNo}(end) = [];
end
if nn.growable(lastHiddenLayerNo)
nodeEvolutionSource(i) = nodeEvolutionSource(i) + nn.getAgmm().M();
for numberOfGMMs = 1 : nn.getAgmm().M()
nn.grow(lastHiddenLayerNo);
ae.grow(lastHiddenLayerNo);
end
elseif nn.prunable{lastHiddenLayerNo}(1) ~= 0
for k = size(nn.prunable{lastHiddenLayerNo}, 1) : -1 : 1
nodeToPrune = nn.prunable{lastHiddenLayerNo}(k);
ae.prune(lastHiddenLayerNo, nodeToPrune);
nn.prune(lastHiddenLayerNo, nodeToPrune);
nodeEvolutionSource(i) = nodeEvolutionSource(i) - 1;
end
end
nn.train(x, y);
end
for j = 1 : numel(nn.layers)-2
ae.weight{j} = nn.weight{j};
ae.bias{j} = nn.bias{j};
end
agmmSourceDisc = nn.getAgmm();
%% Generative phase on Target
ae.setAgmm(agmmTargetGen);
for j = 1 : size(Xt, 1)
x = Xt(j, :);
y = x;
lastHiddenLayerNo = numel(nn.layers) - 1;
ae.forwardpass(x);
if epoch == 1
agmmTargetGenSize(end + 1) = ae.runAgmm(x, y).M();
ae.widthAdaptationStepwise(lastHiddenLayerNo, y);
else
ae.nSamplesFeed = ae.nSamplesFeed - 1;
ae.nSamplesLayer(lastHiddenLayerNo) = ae.nSamplesLayer(lastHiddenLayerNo) - 1;
ae.widthAdaptationStepwise(lastHiddenLayerNo, y);
ae.BIAS2{lastHiddenLayerNo}(end) = [];
ae.VAR{lastHiddenLayerNo}(end) = [];
end
if ae.growable(lastHiddenLayerNo)
nodeEvolutionTarget(i) = nodeEvolutionTarget(i) + ae.getAgmm().M();
for numberOfGMMs = 1 : ae.getAgmm.M()
ae.grow(lastHiddenLayerNo);
nn.grow(lastHiddenLayerNo);
end
elseif ae.prunable{lastHiddenLayerNo}(1) ~= 0
for k = size(ae.prunable{lastHiddenLayerNo}, 1) : -1 : 1
nodeToPrune = ae.prunable{lastHiddenLayerNo}(k);
ae.prune(lastHiddenLayerNo, nodeToPrune);
nn.prune(lastHiddenLayerNo, nodeToPrune);
nodeEvolutionTarget(i) = nodeEvolutionTarget(i) - 1;
end
end
ae.greddyLayerWiseTrain(x, 1, 0.1);
end
for j = 1 : numel(ae.layers)-2
nn.weight{j} = ae.weight{j};
nn.bias{j} = ae.bias{j};
end
agmmTargetGen = ae.getAgmm();
% Kullback-Leibler Divergence
try
common = min(size(Xs,1), size(Xt,1));
KlLossEvolution(end + 1) = ae.updateWeightsByKullbackLeibler(Xs(1:common,:), Xt(1:common,:));
catch
KlLossEvolution(end + 1) = 0;
end
for j = 1 : numel(ae.layers)-2
nn.weight{j} = ae.weight{j};
nn.bias{j} = ae.bias{j};
end
end
if agmmSourceDisc.M() > 1
agmmSourceDisc.deleteCluster();
end
if agmmTargetGen.M() > 1
agmmTargetGen.deleteCluster();
end
trainTime(end + 1) = toc;
gmmTargetBatch(end + 1) = agmmTargetGen.M();
gmmSourceBatch(end + 1) = agmmSourceDisc.M();
%% Print metrics
nodeEvolution(i, :) = nn.layers(2 : end - 1);
if i == 2 || mod(i, round((sourceSize + targetSize)/1000/10)) == 0
fprintf('Minibatch: %d\n', i);
fprintf('Total of samples: %d Source | %d Target\n', size(Xs,1), size(Xt,1));
fprintf('Max Mean Min Now Accu Training time: %f %f %f %f %f\n', max(trainTime(1:i)), mean(trainTime(1:i)), min(trainTime(1:i)), trainTime(i), sum(trainTime(1:i)));
fprintf('Max Mean Min Now Accu Testing time: %f %f %f %f %f\n', max(testTime(1:i)), mean(testTime(1:i)), min(testTime(1:i)), testTime(i), sum(testTime(1:i)));
fprintf('Max Mean Min Now AGMM Source: %d %f %d %d\n', max(agmmSourceDiscSize), mean(agmmSourceDiscSize), min(agmmSourceDiscSize), agmmSourceDiscSize(end));
fprintf('Max Mean Min Now AGMM Target: %d %f %d %d\n', max(agmmTargetGenSize), mean(agmmTargetGenSize), min(agmmTargetGenSize), agmmTargetGenSize(end));
fprintf('Max Mean Min Now CR: %f%% %f%% %f%% %f%%\n', max(crTarget(2:i)) * 100., mean(crTarget(2:i)) * 100., min(crTarget(2:i)) * 100., crTarget(i) * 100.);
fprintf('Max Mean Min Now Classification Loss: %f %f %f %f\n', max(classificationLoss(2:i)), mean(classificationLoss(2:i)), min(classificationLoss(2:i)), classificationLoss(i));
fprintf('Max Mean Min Now KL: %f %f %f %f\n', max(KlLossEvolution(2:i)), mean(KlLossEvolution(2:i)), min(KlLossEvolution(2:i)), KlLossEvolution(i));
fprintf('Max Mean Min Now Nodes: %d %f %d %d\n', max(nodeEvolution(2:i)), mean(nodeEvolution(2:i)), min(nodeEvolution(2:i)), nodeEvolution(i));
fprintf('Network structure: %s (Discriminative) | %s (Generative)\n', num2str(nn.layers(:).'), num2str(ae.layers(:).'));
fprintf('\n');
end
end
fprintf('\n\n')
fprintf('Source CR: %f\n', mean(crSource(2:end)))
fprintf('Target CR: %f\n', mean(crTarget(2:end)))
fprintf('Training time: %f\n', sum(trainTime))
%% ---------------------------- Plotters ----------------------------------
function plotTime(trainTime, testTime)
figure('Name', 'Processing Time', 'NumberTitle', 'off');
hold on
ylim([0 max(max(trainTime), max(testTime)) * 1.1]);
xlim([1 size(trainTime, 2)]);
pTrain = plot(trainTime);
pTest = plot(testTime);
if max(trainTime) > 1
text(find(trainTime == max(trainTime(trainTime > 1)), 1), max(trainTime(trainTime > 1)),...
strcat('\leftarrow Max Train Time:', {' '}, string(max(trainTime(trainTime > 1)))),...
'FontSize', 8,...
'Color', 'black');
text(find(trainTime == min(trainTime(trainTime > 1)), 1), min(trainTime(trainTime > 1)),...
strcat('\leftarrow Min Train Time:', {' '}, string(min(trainTime(trainTime > 1)))),...
'FontSize', 8,...
'Color', 'black');
end
if max(testTime) > 1
text(find(testTime == max(testTime(testTime > 1)), 1), max(testTime(testTime > 1)),...
strcat('\leftarrow Max Test Time:', {' '}, string(max(testTime(testTime > 1)))),...
'FontSize', 8,...
'Color', 'black');
text(find(testTime == min(testTime(testTime > 1)), 1), min(testTime(testTime > 1)),...
strcat('\leftarrow Min Test Time:', {' '}, string(min(testTime(testTime > 1)))),...
'FontSize', 8,...
'Color', 'black');
end
legend([pTrain,...
pTest], [strcat('Train Time Mean | Accumulative:', {' '}, string(mean(trainTime)), {' | '}, string(sum(trainTime))),...
strcat('Test Time Mean | Accumulative:', {' '}, string(mean(testTime)), {' | '}, string(sum(testTime)))]);
ylabel('Time in seconds');
xlabel('Minibatches');
hold off
end
function plotNodeEvolution(nodeEvolution)
figure('Name', 'Node Evolution', 'NumberTitle', 'off');
hold on
ylim([0 max(nodeEvolution, [], 'all') * 1.1]);
xlim([1 size(nodeEvolution, 1)]);
plotArray = [];
legendArray = [];
for i = 1 : size(nodeEvolution, 2)
p = plot(nodeEvolution(:, i));
plotArray = [plotArray, p];
legendArray = [legendArray, strcat('HL', {' '}, string(i), {' '}, 'mean:', {' '}, string(mean(nodeEvolution(nodeEvolution(:, i) > 0, i))))];
text(find(nodeEvolution(:, i) == max(nodeEvolution(:, i)), 1), max(nodeEvolution(:, i)),...
strcat('\leftarrow Max nodes HL ', {' '}, string(i), ':', {' '}, string(max(nodeEvolution(:, i)))),...
'FontSize', 8,...
'Color', 'black');
text(find(nodeEvolution(:, i) == min(nodeEvolution(nodeEvolution(:, i) > 0, i)), 1), min(nodeEvolution(nodeEvolution(:, i) > 0, i)),...
strcat('\leftarrow Min nodes HL ', {' '}, string(i), ':', {' '}, string(min(nodeEvolution(nodeEvolution(:, i) > 0, i)))),...
'FontSize', 8,...
'Color', 'black');
end
ylabel('Number of nodes');
xlabel('Minibatches');
legend(plotArray, legendArray);
hold off
end
function plotAGMM(agmmSource, agmmTarget)
figure('Name', 'Number of GMMs on AGMMs', 'NumberTitle', 'off');
hold on
ylim([0 max(max(agmmTarget), max(agmmSource)) * 1.1]);
xlim([1 size(agmmSource, 2)]);
pAgmmSource = plot(agmmSource);
pAgmmTarget = plot(agmmTarget);
if max(agmmSource) > 1
text(find(agmmSource == max(agmmSource(agmmSource > 1)), 1), max(agmmSource(agmmSource > 1)),...
strcat('\leftarrow Max GMMs Source Discriminative:', {' '}, string(max(agmmSource(agmmSource > 1)))),...
'FontSize', 8,...
'Color', 'black');
text(find(agmmSource == min(agmmSource(agmmSource > 1)), 1), min(agmmSource(agmmSource > 1)),...
strcat('\leftarrow Min GMMs Source Discriminative:', {' '}, string(min(agmmSource(agmmSource > 1)))),...
'FontSize', 8,...
'Color', 'black');
end
if max(agmmTarget) > 1
text(find(agmmTarget == max(agmmTarget(agmmTarget > 1)), 1), max(agmmTarget(agmmTarget > 1)),...
strcat('\leftarrow Max GMMs Target Generative:', {' '}, string(max(agmmTarget(agmmTarget > 1)))),...
'FontSize', 8,...
'Color', 'black');
text(find(agmmTarget == min(agmmTarget(agmmTarget > 1)), 1), min(agmmTarget(agmmTarget > 1)),...
strcat('\leftarrow Min GMMs Target Generative:', {' '}, string(min(agmmTarget(agmmTarget > 1)))),...
'FontSize', 8,...
'Color', 'black');
end
legend([pAgmmSource,...
pAgmmTarget], [strcat('AGMM Source Discriminative Mean:', {' '}, string(mean(agmmSource))),...
strcat('AGMM Target Generative Mean:', {' '}, string(mean(agmmTarget)))]);
ylabel('Number of GMMs');
xlabel('Samples');
hold off
end
function plotLosses(classificationLoss, discriminativeLoss, generativeTargetLoss, kullbackLeiblerLoss)
figure('Name', 'Losses', 'NumberTitle', 'off');
hold on
ylim([0 max(max(kullbackLeiblerLoss), max(max(max(classificationLoss), max(discriminativeLoss)), max(generativeTargetLoss))) * 1.1]);
xlim([1 size(classificationLoss, 1)]);
pClassificationLoss = plot(classificationLoss);
pDiscriminativeLoss = plot(discriminativeLoss);
pGenerativeTargetLoss = plot(generativeTargetLoss);
pKullbackLeiblerLoss = plot(kullbackLeiblerLoss);
text(find(classificationLoss == max(classificationLoss), 1), max(classificationLoss),...
strcat('\leftarrow Max Classification Loss:', {' '}, string(max(classificationLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(classificationLoss == min(classificationLoss), 1), min(classificationLoss),...
strcat('\leftarrow Min Classification Loss:', {' '}, string(min(classificationLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(discriminativeLoss == max(discriminativeLoss), 1), max(discriminativeLoss),...
strcat('\leftarrow Max Discriminative Loss:', {' '}, string(max(discriminativeLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(discriminativeLoss == min(discriminativeLoss), 1), min(discriminativeLoss),...
strcat('\leftarrow Min Discriminative Loss:', {' '}, string(min(discriminativeLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(generativeTargetLoss == max(generativeTargetLoss), 1), max(generativeTargetLoss),...
strcat('\leftarrow Max Generative Target Loss:', {' '}, string(max(generativeTargetLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(generativeTargetLoss == min(generativeTargetLoss), 1), min(generativeTargetLoss),...
strcat('\leftarrow Min Generative Target Loss:', {' '}, string(min(generativeTargetLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(kullbackLeiblerLoss == max(kullbackLeiblerLoss), 1), max(kullbackLeiblerLoss),...
strcat('\leftarrow Max KL Div Loss:', {' '}, string(max(kullbackLeiblerLoss))),...
'FontSize', 8,...
'Color', 'black');
text(find(kullbackLeiblerLoss == min(kullbackLeiblerLoss), 1), min(kullbackLeiblerLoss),...
strcat('\leftarrow Min KL Div Loss:', {' '}, string(min(kullbackLeiblerLoss))),...
'FontSize', 8,...
'Color', 'black');
ylabel('Loss Value');
xlabel('Minibatches');
legend([pClassificationLoss,...
pDiscriminativeLoss,...
pGenerativeTargetLoss,...
pKullbackLeiblerLoss], [strcat('Classification Loss Mean:', {' '}, string(mean(classificationLoss(2:end)))),...
strcat('Discriminative Loss Mean:', {' '}, string(mean(discriminativeLoss))),...
strcat('Generative Target Loss Mean:', {' '}, string(mean(generativeTargetLoss))),...
strcat('Kullback Leibler Divergence Loss Mean:', {' '}, string(mean(kullbackLeiblerLoss)))]);
hold off
end
function plotClassificationRate(source, target, nMinibatches)
figure('Name', 'Source and Target Classification Rates', 'NumberTitle', 'off');
hold on
ylim([0 max(max(source), max(target)) * 1.1]);
xlim([1 nMinibatches]);
niceBlue = [0 0.4470 0.7410];
niceYellow = [0.8500 0.3250 0.0980];
pSource = plot(source, 'Color', niceYellow, 'LineStyle', ':');
pTarget = plot(target, 'Color', niceBlue);
text(find(source == max(source), 1), max(source),...
strcat('\leftarrow Max Source:', {' '}, string(max(source))),...
'FontSize', 8,...
'Color', 'black');
text(find(source == min(source), 1), min(source),...
strcat('\leftarrow Min Source:', {' '}, string(min(source))),...
'FontSize', 8,...
'Color', 'black');
text(find(target == max(target), 1), max(target),...
strcat('\leftarrow Max Target:', {' '}, string(max(target))),...
'FontSize', 8,...
'Color', 'black');
text(find(target == min(target), 1), min(target),...
strcat('\leftarrow Min Target:', {' '}, string(min(target))),...
'FontSize', 8,...
'Color', 'black');
ylabel('Classification Rate');
xlabel('Minibatches');
legend([pSource, pTarget], [strcat('Source Mean:', {' '}, string(mean(source(2:end)))),...
strcat('Target Mean:', {' '}, string(mean(target(2:end))))]);
hold off
end
function plotBIAS2andVAR(BIAS2, VAR)
sampleLayerCount = zeros(1, size(BIAS2, 2));
yAxisLim = 0;
bias2 = [];
var = [];
for i = 2 : size(BIAS2, 2)
sampleLayerCount(i) = sampleLayerCount(i - 1) + size(BIAS2{i}, 2);
for j = 1 : size(BIAS2{i}, 2)
bias2 = [bias2, BIAS2{i}(j)];
var = [var, VAR{i}(j)];
yAxisLim = max(yAxisLim, bias2(end) + var(end));
end
end
clear BIAS2 VAR
figure('Name', 'BIAS2, VAR, and NS', 'NumberTitle', 'off');
hold on
ylim([0 max(max(bias2), max(var)) * 1.1]);
xlim([1 size(bias2, 2)]);
p1 = plot(bias2);
p2 = plot(var);
p3 = plot(bias2 + var);
for j = 1: ceil(size(bias2, 2)/4) : size(bias2, 2)
if ~isnan(bias2(j))
text(j, bias2(j),...
strcat('\leftarrow', {' '}, 'BIAS2 =', {' '}, string(bias2(j))),...
'FontSize', 8);
end
end
text(size(bias2, 2), bias2(end), string(bias2(end)));
for j = 1: ceil(size(var, 2)/4) : size(var, 2)
if ~isnan(var(j))
text(j, var(j),...
strcat('\leftarrow', {' '}, 'VAR =', {' '}, string(var(j))),...
'FontSize', 8);
end
end
text(size(var, 2), var(end), string(var(end)));
for j = 1: ceil(size(var + bias2, 2)/4) : size(var + bias2, 2)
if ~isnan(var(j)) && ~isnan(bias2(j)) && ~isnan(var(j) + bias2(j))
text(j, var(j) + bias2(j),...
strcat('\leftarrow', {' '}, 'NS =', {' '}, string(var(j) + bias2(j))),...
'FontSize', 8);
end
end
text(size(var + bias2, 2), var(end) + bias2(end), string(var(end) + bias2(end)));
for i = 2 : size(sampleLayerCount, 2) - 1
line([sampleLayerCount(i), sampleLayerCount(i)], [-yAxisLim * 2 yAxisLim * 2],...
'LineStyle', ':',...
'Color', 'magenta');
end
ylabel('Value');
xlabel('Sample');
legend([p1, p2, p3], [strcat('BIAS2 Mean:', {' '}, string(mean(bias2(2:end)))),...
strcat('VAR Mean:', {' '}, string(mean(var(2:end)))),...
strcat('NS Mean:', {' '}, string(mean(var(2:end) + bias2(2:end))))]);
hold off
end
function plotBIAS2andVARGen(BIAS2, VAR)
sampleLayerCount = zeros(1, size(BIAS2, 2));
yAxisLim = 0;
bias2 = [];
var = [];
for i = 2 : size(BIAS2, 2)
sampleLayerCount(i) = sampleLayerCount(i - 1) + size(BIAS2{i}, 2);
for j = 1 : size(BIAS2{i}, 2)
bias2 = [bias2, BIAS2{i}(j)];
var = [var, VAR{i}(j)];
yAxisLim = max(yAxisLim, bias2(end) + var(end));
end
end
clear BIAS2 VAR
figure('Name', 'BIAS2, VAR, and NS Generative', 'NumberTitle', 'off');
hold on
ylim([0 max(max(bias2), max(var)) * 1.1]);
xlim([1 size(bias2, 2)]);
p1 = plot(bias2);
p2 = plot(var);
p3 = plot(bias2 + var);
for j = 1: ceil(size(bias2, 2)/4) : size(bias2, 2)
if ~isnan(bias2(j))
text(j, bias2(j),...
strcat('\leftarrow', {' '}, 'BIAS2 =', {' '}, string(bias2(j))),...
'FontSize', 8);
end
end
text(size(bias2, 2), bias2(end), string(bias2(end)));
for j = 1: ceil(size(var, 2)/4) : size(var, 2)
if ~isnan(var(j))
text(j, var(j),...
strcat('\leftarrow', {' '}, 'VAR =', {' '}, string(var(j))),...
'FontSize', 8);
end
end
text(size(var, 2), var(end), string(var(end)));
for j = 1: ceil(size(var + bias2, 2)/4) : size(var + bias2, 2)
if ~isnan(var(j)) && ~isnan(bias2(j)) && ~isnan(var(j) + bias2(j))
text(j, var(j) + bias2(j),...
strcat('\leftarrow', {' '}, 'NS =', {' '}, string(var(j) + bias2(j))),...
'FontSize', 8);
end
end
text(size(var + bias2, 2), var(end) + bias2(end), string(var(end) + bias2(end)));
for i = 2 : size(sampleLayerCount, 2) - 1
line([sampleLayerCount(i), sampleLayerCount(i)], [-yAxisLim * 2 yAxisLim * 2],...
'LineStyle', ':',...
'Color', 'magenta');
end
ylabel('Value');
xlabel('Sample');
legend([p1, p2, p3], [strcat('BIAS2 Mean:', {' '}, string(mean(bias2))),...
strcat('VAR Mean:', {' '}, string(mean(var))),...
strcat('NS Mean:', {' '}, string(mean(var + bias2)))]);
hold off
end
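ATL.m above reads a `filename` prefix and loads '<prefix>_source.csv' / '<prefix>_target.csv'. A hedged sketch of a driver follows (the explicit assignment is an assumption, since the diff only shows `filename;`; the 'usps_mnist' prefix is one produced by ATL/prepare_datasets.py at the end of this commit):

% Hypothetical driver script: set the dataset prefix, then run ATL.m.
filename = 'usps_mnist';   % prefix generated by ATL/prepare_datasets.py
ATL;                       % runs the streaming source/target training loop above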

183
ATL/AutoEncoder.m Normal file

@@ -0,0 +1,183 @@
classdef AutoEncoder < NeuralNetwork
%AutoEncoder
% This object mimics the behavior of an Auto Encoder network, which is
% a Neural Network whose output is equal to its input.
% This object has elastic abilities, being able to grow and prune
% nodes automatically.
% TODO: Provide the paper or study material for the Auto Encoder
properties (Access = protected)
greedyLayerBias = [];
greedyLayerOutputBias;
end
methods (Access = public)
function self = AutoEncoder(layers)
% AutoEncoder
% layers (array)
% This array describes a FeedForward Network structure by
% the number of nodes in each of its layers.
% An FFNN with an input layer of 8 nodes, a hidden layer
% of 10 nodes and an output layer of 3 nodes would be
% described by [8 10 3].
% An FFNN with an input layer of 784 nodes, a hidden
% layer 1 of 800 nodes, a hidden layer 2 of 400 nodes and
% an output layer of 10 nodes would be described as [784 800 400 10]
self@NeuralNetwork(layers);
self.outputActivationFunctionLossFunction = self.ACTIVATION_LOSS_FUNCTION_SIGMOID_MSE();
end
function test(self, X)
% test
% See test@NeuralNetwork
% X (matrix)
% Input and output data
test@NeuralNetwork(self, X, X)
end
function grow(self, layerNo)
grow@NeuralNetwork(self, layerNo);
self.growGreedyLayerBias(layerNo);
end
function prune(self, layerNo, nodeNo)
prune@NeuralNetwork(self, layerNo, nodeNo);
self.pruneGreedyLayerBias(layerNo, nodeNo);
end
function growGreedyLayerBias(self, layerNo)
b = layerNo; %readability
if b == (numel(self.layers) - 1)
self.greedyLayerOutputBias = [self.greedyLayerOutputBias normrnd(0, sqrt(2 / (self.layers(end-1) + 1)))];
else
self.greedyLayerBias{b} = [self.greedyLayerBias{b} normrnd(0, sqrt(2 / (self.layers(b) + 1)))];
end
end
function growLayer(self, option, numberOfNodes)
if option == self.CREATE_MIRRORED_LAYER()
nhl = self.nHiddenLayers + 1;
growLayer@NeuralNetwork(self, self.CREATE_LAYER_BY_ARGUMENT(), numberOfNodes);
growLayer@NeuralNetwork(self, self.CREATE_LAYER_BY_ARGUMENT(), self.layers(nhl));
else
growLayer@NeuralNetwork(self, option, numberOfNodes);
self.greedyLayerBias{size(self.greedyLayerBias, 2) + 1} = self.greedyLayerOutputBias;
self.greedyLayerOutputBias = normrnd(0, sqrt(2 / (self.layers(end-1) + 1)));
end
end
function pruneGreedyLayerBias(self, layerNo, nodeNo)
b = layerNo; % readability
n = nodeNo; %readability
if b == (numel(self.layers) - 1)
self.greedyLayerOutputBias(n) = [];
else
self.greedyLayerBias{b}(n) = [];
end
end
function greddyLayerWiseTrain(self, X, nEpochs, noiseRatio)
%greddyLayerWiseTrain
% Performs Greedy Layer Wise train
% X (matrix)
% Input and output data
% nEpochs (integer)
% The number of epochs for which the greedy layer-wise
% training will run. If you are running a single-pass model,
% you want this to be equal to one.
if nargin == 3
noiseRatio = 0;
end
% disp(self.layers)
for i = 1 : numel(self.layers) - 1
self.forwardpass(X);
trainingX = self.layerValue{i};
Xnoise = (rand(size(trainingX)) >= noiseRatio) .* trainingX;
if i > self.nHiddenLayers
nn = NeuralNetwork([self.layers(i) self.layers(end) self.layers(i)]);
else
nn = NeuralNetwork([self.layers(i) self.layers(i+1) self.layers(i)]);
end
nn.outputActivationFunctionLossFunction = self.ACTIVATION_LOSS_FUNCTION_SIGMOID_MSE();
if i > self.nHiddenLayers
nn.weight{1} = self.outputWeight;
nn.bias{1} = self.outputBias;
nn.outputWeight = self.outputWeight';
if isempty(self.greedyLayerOutputBias)
self.greedyLayerOutputBias = normrnd(0, sqrt(2 / (size(self.outputWeight', 2) + 1)),...
1, size(self.outputWeight', 1));
nn.outputBias = self.greedyLayerOutputBias;
else
nn.outputBias = self.greedyLayerOutputBias;
end
else
nn.weight{1} = self.weight{i};
nn.bias{1} = self.bias{i};
nn.outputWeight = self.weight{i}';
try
nn.outputBias = self.greedyLayerBias{i};
catch
self.greedyLayerBias{i} = normrnd(0, sqrt(2 / (size(self.weight{i}', 2) + 1)),...
1, size(self.weight{i}', 1));
nn.outputBias = self.greedyLayerBias{i};
end
end
for j = 1 : nEpochs
nn.train(Xnoise, trainingX);
end
if i > self.nHiddenLayers
self.outputWeight = nn.weight{1};
self.outputBias = nn.bias{1};
else
self.weight{i} = nn.weight{1};
self.bias{i} = nn.bias{1};
end
end
end
function loss = updateWeightsByKullbackLeibler(self, Xs, Xt, GAMMA)
if nargin == 3
GAMMA = 0.0001;
end
loss = updateWeightsByKullbackLeibler@NeuralNetwork(self, Xs, Xs, Xt, Xt, GAMMA);
end
end
methods (Access = protected)
function BIAS2 = computeBIAS2(~, Ez, y)
%computeBIAS2
% The way AutoEncoders calculate their BIAS2 value per layer is
% different from normal neural networks. Because we use
% sigmoid as our output activation function, and because the
% error is too high, we prefer to use the mean as a way to squash
% the BIAS2.
% Ez (double, vector or matrix)
% Expected outbound value of that layer
% y (double, vector or matrix)
% A target class
%
% return BIAS2 = The network squared BIAS
BIAS2 = mean((Ez - y') .^ 2);
end
function var = computeVAR(~, Ez, Ez2)
%computeVAR
% The way AutoEncoders calculate their VAR value per layer is
% different from normal neural networks. Because we use
% sigmoid as our output activation function, and because the
% error is too high, we prefer to use the mean as a way to squash
% the variance.
% Ez (double, vector or matrix)
% Expected outbound value of that layer
% Ez2 (double, vector or matrix)
% Expected outbound squared value of that layer
%
% return VAR = The network VAR (variance)
var = mean(Ez2 - Ez .^ 2);
end
end
end
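A minimal sketch of the single-pass greedy layer-wise training above (the [8 4 8] topology and the sample are illustrative; it assumes the NeuralNetwork.m base class whose diff is suppressed later in this commit):

% Illustrative sketch: single-pass greedy layer-wise training of an AutoEncoder.
ae = AutoEncoder([8 4 8]);       % assumed topology: 8 inputs, 4 hidden nodes, 8 outputs
x = rand(1, 8);                  % one hypothetical input sample
ae.greddyLayerWiseTrain(x, 1);   % nEpochs = 1; noiseRatio defaults to 0
ae.test(x);                      % reconstruction loss is read back from ae.lossValue, as ATL.m does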

396
ATL/DataManipulator.m Normal file

@@ -0,0 +1,396 @@
classdef DataManipulator < handle
%DataManipulator It manipulates and prepares data
% It manipulates and prepares data used to train and test our research
% models.
% It is already prepared to load and interact with most of the data
% used in our lab.
properties (Access = public)
data = []; % Whole dataset
nFeatures = 0; % Number of features from the dataset
nClasses = 0; % Number of classes from the dataset
nFoldElements = 0; % Number of elements per fold
nMinibatches = 0; % Number of minibatches
source = {}; % Source data
target = {}; % Target data
end
properties (Access = private)
X = {}; % Input data
y = {}; % Class data
Xs = {}; % Source input data
ys = {}; % Source class data
Xt = {}; % Target input data
yt = {}; % Target class data
permutedX = {}; % Permuted input data
permutedy = {}; % Permuted class data
indexPermutation = {}; % Permutation index (in order to know whether a sample came from source or target)
dataFolderPath = '';
end
methods (Access = public)
function self = DataManipulator(dataFolderPath)
self.dataFolderPath = dataFolderPath;
end
function loadSourceCSV(self, dataset)
self.loadCustomCSV(join([dataset, '_source.csv']))
end
function loadTargetCSV(self, dataset)
self.loadCustomCSV(join([dataset, '_target.csv']))
end
function loadCustomCSV(self, filename)
self.data = [];
self.data = csvread(strcat(self.dataFolderPath, filename));
self.checkDatasetEven();
self.data = double(self.data);
self.nFeatures = size(self.data, 2) - 1;
self.nClasses = 1;
self.X = self.data(:,1:end-self.nClasses);
self.y = self.data(:,self.nFeatures+1:end);
self.nClasses = max(self.y);
y_one_hot = zeros(size(self.y, 1), self.nClasses);
for i = 1 : self.nClasses
rows = self.y == i;
y_one_hot(rows, i) = 1;
end
self.y = y_one_hot;
self.data = [self.X self.y];
end
function normalize(self)
%normalize
% Normalize every feature between 0 and 1
fprintf('Normalizing data\n');
for i = 1 : self.nFeatures
self.data(:, i) = (self.data(:, i) - min(self.data(:, i), [], 'all'))/max(self.data(:, i), [], 'all');
end
self.X = self.data(:, 1 : self.nFeatures);
self.y = self.data(:, self.nFeatures + 1 : end);
end
function splitAsSourceTargetStreams(self, nFoldElements, method, samplingRatio)
%splitAsSourceTargetStreams
% Split the data to simulate Multistream classification
% input domains.
% In a Multistream classification problem, we consider that
% two different but related processes generate data
% continuously from a domain D (in this case, self.data). The
% first process operates in a supervised environment, i.e.,
% all the data instances that are generated from the first
% process are labeled. On the contrary, the second process
% generates unlabeled data from the same domain. The streams
% of data generated from the above processes are called the
% source stream and the target stream.
% This function will return labels for the target stream,
% which the user should only use for ensemble evaluation
% purposes.
% nFoldElements (integer)
% Both source and target data will be split into chunks
% of data containing n elements per chunk/fold.
% If you only want one chunk, pass zero or size(data,1)
% as argument.
% method (string)
% What kind of method will be used to generate and
% distribute the data into source and target. Usually,
% Multistream Classification problems distribute the data
% using some bias probability.
% Options:
% 'none': Source and Target streams will be split in
% half
% 'dallas_1': Source and Target streams will be split
% in half using the bias described by the paper "An
% adaptive framework for multistream classification"
% from the CS department of the University of Texas at
% Dallas
% 'dallas_2': Source and Target streams will be
% split in half using the bias described by the paper
% "FUSION - An online method for multistream
% classification" from the University of Texas at
% Dallas.
% samplingRatio (double)
% Value in the interval [0.0,1.0] which describes the
% percentage of samples that will go to the Source Stream.
% Target will have 1 - n percent of the data.
if nFoldElements == 0
self.nFoldElements = length(self.data);
else
self.nFoldElements = nFoldElements;
end
switch method
case 'none'
self.splitAsSourceTargetStreams_none(self.nFoldElements, samplingRatio);
case 'dallas_1'
self.splitAsSourceTargetStreams_dallas1(self.nFoldElements, samplingRatio);
case 'dallas_2'
self.splitAsSourceTargetStreams_dallas2(self.nFoldElements, samplingRatio);
end
self.createXsYsXtYt()
end
function X = getX(self, idx)
X = self.X(idx,:);
end
function y = getY(self, idx)
y = self.y(idx,:);
end
function Xs = getXs(self, nMinibatch)
%getXs
% Get the input matrix from a specific source data stream.
% The source stream will only be created when we are dealing
% with a dataset that was split into source and target
% data streams.
% nMinibatch (integer)
% The minibatch iteration
Xs = self.Xs{nMinibatch};
end
function ys = getYs(self, nMinibatch)
%getYs
% Get the target matrix from a specific source data stream.
% The source stream will only be created when we are dealing
% with a dataset that was split into source and target
% data streams.
% nMinibatch (integer)
% The minibatch iteration
ys = self.ys{nMinibatch};
end
function Xt = getXt(self, nMinibatch)
%getXt
% Get the input matrix from a specific target data stream.
% The target stream will only be created when we are dealing
% with a dataset that was split into source and target
% data streams.
% nMinibatch (integer)
% The minibatch iteration
Xt = self.Xt{nMinibatch};
end
function yt = getYt(self, nMinibatch)
%getYt
% Get the target matrix from a specific target data stream.
% The target stream will only be created when we are dealing
% with a dataset that was split into source and target
% data streams.
% nMinibatch (integer)
% The minibatch iteration
yt = self.yt{nMinibatch};
end
end
methods (Access = private)
function splitAsSourceTargetStreams_none(self, elementsPerFold, samplingRatio)
%splitAsSourceTargetStreams_none
% Split the data to simulate Multistream classification
% input domains.
%
% Source and Target streams will be split in half
%
% nFoldElements (integer)
% Both source and target data will be split into chunks
% of data containing n elements per chunk/fold.
% If you only want one chunk, pass zero or size(data,1)
% as argument.
% samplingRatio (double)
% Value in the interval [0.0,1.0] which describes the
% percentage of samples that would go to the Source Stream.
% Target will have 1 - n percent of the data.
[rowsNumber, ~] = size(self.data);
self.nFoldElements = elementsPerFold;
j = 0;
b = 1;
i = 1;
source = [];
while i < size(self.data, 1)
while j < self.nFoldElements && i < size(self.data, 1)
source = [source; self.data(i,:)];
j = j + 1;
i = i + 1;
end
self.source{b} = source;
self.target{b} = source;
source = [];
j = 0;
b = b + 1;
end
self.nMinibatches = b - 1;
end
function splitAsSourceTargetStreams_dallas1(self, elementsPerFold, samplingRatio)
%splitAsSourceTargetStreams_dallas1
% Split the data to simulate Multistream classification
% input domains.
%
% Source and Target streams will be split in half using the
% bias described by the paper "An adaptive framework for
% multistream classification" from the CS department of the
% University of Texas at Dallas.
%
% nFoldElements (integer)
% Both source and target data will be split into chunks
% of data containing n elements per chunk/fold.
% If you only want one chunk, pass zero or size(data,1)
% as argument.
% samplingRatio (double)
% Value in the interval [0.0,1.0] which describes the
% percentage of samples that would go to the Source Stream.
% Target will have 1 - n percent of the data.
[rowsNumber, ~] = size(self.data);
numberOfFolds = round(length(self.data)/elementsPerFold);
chunkSize = round(rowsNumber/numberOfFolds);
numberOfFoldsRounded = round(rowsNumber/chunkSize);
self.nFoldElements = min(elementsPerFold, length(self.data)/numberOfFoldsRounded);
if length(self.data)/numberOfFoldsRounded > elementsPerFold
numberOfFolds = numberOfFolds + 1;
end
self.nMinibatches = numberOfFolds;
ck = self.nFoldElements;
for i = 1:numberOfFolds
x = [];
data = [];
if i > numberOfFoldsRounded
x = self.data(ck * (i-1) + 1:end,1:end-self.nClasses);
data = self.data(ck * (i-1) + 1:end,1:end);
else
x = self.data(ck * (i-1) + 1:ck * i,1:end-self.nClasses);
data = self.data(ck * (i-1) + 1:ck * i,1:end);
end
x_mean = mean(x);
probability = exp(-abs(x - x_mean).^2);
[~,idx] = sort(probability);
m = size(data,1);
source = data(idx(1:ceil(m*samplingRatio)),1:end);
target = data(idx(ceil(m*samplingRatio)+1:length(data)),1:end);
self.source{i} = source;
self.target{i} = target;
end
end
function splitAsSourceTargetStreams_dallas2(self, elementsPerFold, samplingRatio)
%splitAsSourceTargetStreams_dallas2
% Split the data to simulate Multistream classification
% input domains.
%
% Source and Target streams will be split in half using the
% bias described by the paper "FUSION - An online method for
% multistream classification" from the University of Texas at
% Dallas.
%
% nFoldElements (integer)
% Both source and target data will be split into chunks
% of data containing n elements per chunk/fold.
% If you only want one chunk, pass zero or size(data,1)
% as argument.
% samplingRatio (double)
% Value in the interval [0.0,1.0] which describes the
% percentage of samples that would go to the Source Stream.
% Target will have 1 - n percent of the data.
[rowsNumber, ~] = size(self.data);
numberOfFolds = round(length(self.data)/elementsPerFold);
chunkSize = round(rowsNumber/numberOfFolds);
numberOfFoldsRounded = round(rowsNumber/chunkSize);
if mod(floor(size(self.data, 1)/numberOfFoldsRounded), 2) == 0
self.nFoldElements = min(elementsPerFold, floor(size(self.data, 1)/numberOfFoldsRounded));
else
self.nFoldElements = min(elementsPerFold, floor(size(self.data, 1)/numberOfFoldsRounded) - 1);
end
if length(self.data)/numberOfFoldsRounded > elementsPerFold
numberOfFolds = numberOfFolds + 1;
end
self.nMinibatches = numberOfFolds;
ck = self.nFoldElements;
for i = 1 : numberOfFolds
x = [];
data = [];
if i > numberOfFoldsRounded
x = self.data(ck * (i-1) + 1:end,1:end-self.nClasses);
data = self.data(ck * (i-1) + 1:end,1:end);
else
x = self.data(ck * (i-1) + 1:ck * i,1:end-self.nClasses);
data = self.data(ck * (i-1) + 1:ck * i,1:end);
end
x_mean = mean(x);
norm_1 = vecnorm((x - x_mean)',1)';
norm_2 = vecnorm((x - x_mean)',2)';
numerator = norm_2;
denominator = 2 * std(norm_1) ^ 2;
probability = exp(-numerator/denominator);
[~,idx] = sort(probability);
m = size(data,1);
source = data(idx(1 : ceil(m * samplingRatio)), 1 : end);
target = data(idx(ceil(m * samplingRatio) + 1: size(data, 1)), 1 : end);
self.source{i} = source;
self.target{i} = target;
end
end
function createXsYsXtYt(self)
%createXsYsXtYt
% Split the data stream into sets of input, output, input
% from source, output from source, input from target, and output
% from target.
% It also creates a permuted version of this data, along with the
% permutation index used to shuffle it.
self.X = {};
self.y = {};
self.Xs = {};
self.ys = {};
self.Xt = {};
self.yt = {};
self.permutedX = {};
self.permutedy = {};
for i = 1 : self.nMinibatches
self.Xs{i} = self.source{i}(:,1:end-self.nClasses);
self.ys{i} = self.source{i}(:,self.nFeatures+1:end);
self.Xt{i} = self.target{i}(:,1:end-self.nClasses);
self.yt{i} = self.target{i}(:,self.nFeatures+1:end);
self.X{i} = [self.Xs{i};self.Xt{i}];
self.y{i} = [self.ys{i};self.yt{i}];
x = self.X{i};
Y = self.y{i};
p = randperm(size(x, 1));
self.permutedX{i} = x(p,:);
self.permutedy{i} = Y(p,:);
self.indexPermutation{i} = p;
end
end
function checkDatasetEven(self)
%checkDatasetEven
% Check if the number of rows in the whole dataset is even,
% so we can split into an equal number of elements for source
% and target (when splitting by a 0.5 ratio).
% If the number is odd, randomly throw a row away.
if mod(length(self.data),2) ~= 0
p = ceil(rand() * length(self.data));
self.data = [self.data(1:p-1,:);self.data(p+1:end,:)];
end
end
end
end
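A brief sketch of the splitting API documented above (the CSV name and fold size are hypothetical; ATL.m itself only calls loadSourceCSV/loadTargetCSV with getX/getY):

% Hypothetical usage of the source/target stream splitting API.
dm = DataManipulator('');                 % empty folder path: read from the working directory
dm.loadCustomCSV('some_dataset.csv');     % hypothetical CSV with feature columns plus a label column
dm.normalize();
dm.splitAsSourceTargetStreams(500, 'dallas_2', 0.5);  % 500 elements per fold, FUSION-style bias
Xs1 = dm.getXs(1);   % source inputs of the first minibatch
yt1 = dm.getYt(1);   % target labels of the first minibatch (evaluation only)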

75
ATL/DenoisingAutoEncoder.m Normal file

@@ -0,0 +1,75 @@
classdef DenoisingAutoEncoder < AutoEncoder
%DenoisingAutoEncoder
% This object mimics the behavior of a Denoising Auto Encoder network,
% which is an Auto Encoder that receives noisy input data and tries
% to denoise it.
% This object has elastic abilities, being able to grow and prune
% nodes automatically.
% TODO: Provide the paper or study material for the Denoising Auto Encoder
methods (Access = public)
% function self = DenoisingAutoEncoder(nInput, nHiddenNodes)
% %DenoisingAutoEncoder Construct an instance of this class
% % nInput (integer)
% % Number of input nodes
% % nHiddenNodes (integer)
% % Number of nodes at the hidden layer
% self@AutoEncoder(nInput, nHiddenNodes);
% end
function self = DenoisingAutoEncoder(layers)
%DenoisingAutoEncoder Construct an instance of this class
self@AutoEncoder(layers);
end
function train(self, X, noiseRatio, nWeight)
% train
% See train@NeuralNetwork
% X (matrix)
% Input and output data
% noiseRatio (double)
% Value between 0.0 and 1.0
% It indicates the percentage of noise that will be
% applied on the input data.
% nWeight (integer) [optional]
% You have the ability to define which weight and bias you
% want to update using backpropagation. This method will
% update only that weight and bias, even if there are
% weights and biases on layers before and after that.
% The number of the weight and bias you want to update.
% Remember that 1 indicates the weight and bias that come
% out of the input layer.
if nargin == 3
train@AutoEncoder(self, X, noiseRatio)
elseif nargin == 4
train@AutoEncoder(self, X, noiseRatio, nWeight);
end
end
function greddyLayerWiseTrain(self, X, nEpochs, noiseRatio)
%greddyLayerWiseTrain
% Performs Greedy Layer Wise train
% TODO: Provide the paper or study material for the Greedy
% layer Wise train
% X (matrix)
% Input and output data
% nEpochs (integer)
% The number of epochs for which the greedy layer-wise
% training will run. If you are running a single-pass model,
% you want this to be equal to one.
% noiseRatio (double)
% Value between 0.0 and 1.0
% It indicates the percentage of noise that will be
% applied on the input data.
% isTiedWeight (bool) [optional]
% On a Tied Weight training, after the training the weights
% after the middle layer will be a transposed version of
% the weights before the middle layer. The bias is still
% kept. This makes the network harder to train, and
% that is good when we are preparing the network for
% another kind of data.
greddyLayerWiseTrain@AutoEncoder(self, X, nEpochs, noiseRatio);
end
end
end
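This mirrors how ATL.m drives the denoising auto encoder during its generative phase; the [16 8 16] layer widths are illustrative assumptions:

% Sketch mirroring ATL.m's generative phase (layer widths are illustrative).
ae = DenoisingAutoEncoder([16 8 16]);
xt = rand(1, 16);                     % one hypothetical target-stream sample
ae.greddyLayerWiseTrain(xt, 1, 0.1);  % 1 epoch, 10% masking noise, as in ATL.m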

110
ATL/ElasticNodes.m Normal file

@@ -0,0 +1,110 @@
classdef ElasticNodes < handle
%ELASTICNODES It encapsulate global variables necessary for width
%adaptation
%
% This class enabless elastic network width. Network width adaptation
% supports automatic generation of new hidden nodes and prunning of
% inconsequential nodes. This mechanism is controlled by the NS
% (Network Significance) method which estimates the network
% generalization power in terms of bias and variance
properties (Access = public)
growable; % See full comment below
% Holds an array of boolean elements indicating whether that layer
% can grow during the width adaptation procedure
prunable; % See full comment below
% Holds an array of integer elements indicating whether that layer
% can be pruned during the width adaptation procedure.
% 0 indicates that no node should be pruned. Anything different
% from zero indicates which node should be pruned in that layer.
end
properties (Access = public)
dataMean = 0;
dataStd = 0;
dataVar = 0;
nSamplesFeed = 0;
nSamplesLayer;
% NS = Network Significance
%BIAS VARIABLES
meanBIAS;
varBIAS;
stdBIAS;
minMeanBIAS;
minStdBIAS;
BIAS2;
%VAR VARIABLES
meanVAR;
varVAR;
stdVAR;
minMeanVAR;
minStdVAR;
VAR;
% metrics
nodeEvolution = {}; % TODO: Need to include at the grow/prune part
end
%% Evolving layers properties
properties (Access = public)
alpha = 0.005;
gradientBias = [];
meanNetBias2;
meanNetVar;
end
methods (Access = protected)
function self = ElasticNodes(nHiddenLayers)
nhl = nHiddenLayers; % readability
self.nSamplesLayer = zeros(1,nhl);
self.meanBIAS = zeros(1,nhl);
self.varBIAS = zeros(1,nhl);
self.stdBIAS = zeros(1,nhl);
self.minMeanBIAS = ones(1,nhl) * inf;
self.minStdBIAS = ones(1,nhl) * inf;
self.BIAS2 = num2cell(zeros(1,nhl));
self.meanVAR = zeros(1,nhl);
self.varVAR = zeros(1,nhl);
self.stdVAR = zeros(1,nhl);
self.minMeanVAR = ones(1,nhl) * inf;
self.minStdVAR = ones(1,nhl) * inf;
self.VAR = num2cell(zeros(1,nhl));
self.growable = zeros(1,nhl);
% self.prunable = zeros(1,nhl);
self.prunable = cell(1,nhl);
for i = 1 : nhl
self.prunable{i} = 0;
end
end
function growLayerEvolutiveParameter(self, numberHiddenLayers)
nhl = numberHiddenLayers; %readability
self.nSamplesLayer = [self.nSamplesLayer, 0];
self.meanBIAS = [self.meanBIAS, 0];
self.varBIAS = [self.varBIAS, 0];
self.stdBIAS = [self.stdBIAS, 0];
self.minMeanBIAS = [self.minMeanBIAS, 0];
self.minStdBIAS = [self.minStdBIAS, 0];
self.BIAS2 = [self.BIAS2, 0];
self.meanVAR = [self.meanVAR, 0];
self.varVAR = [self.varVAR, 0];
self.stdVAR = [self.stdVAR, 0];
self.minMeanVAR = [self.minMeanVAR, 0];
self.minStdVAR = [self.minStdVAR, 0];
self.VAR = [self.VAR, 0];
self.growable = zeros(1, nhl + 1);
self.prunable = cell(1, nhl + 1);
for i = 1 : nhl + 1
self.prunable{i} = 0;
end
end
end
end

41
ATL/GMM.m Normal file

@@ -0,0 +1,41 @@
classdef GMM < handle
%GMM Gaussian Mixture Model
properties (Access = public)
weight = 1;
center = 1;
var = [];
winCounter = 1;
inferenceSum = 0;
surviveCounter = 0;
yCount;
inference = 0;
hyperVolume = 0;
end
properties (Access = private)
nFeatures;
end
methods (Access = public)
function self = GMM(x)
%GMM
% x (vector)
% A data-sample (without its target) representing the
% initial GMM center
self.nFeatures = size(x, 2);
self.center = x;
self.var = 0.01 * ones(1, self.nFeatures);
end
function computeInference(self, x)
c = self.center;
dist = (x - c) .^ 2 ./ self.var;
[self.inference, maxMahalDistIdx] = min(exp(-0.5 * dist));
self.hyperVolume = self.var(maxMahalDistIdx);
end
end
end
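A short sketch of the per-cluster inference above (the sample values are arbitrary):

% Sketch: inference of a single GMM cluster on an arbitrary sample.
g = GMM([0.2 0.4 0.6]);                 % cluster centred on its first sample, var = 0.01 per feature
g.computeInference([0.25 0.38 0.61]);
% g.inference = exp(-0.5 * d) for the feature with the largest normalized
% squared distance d; g.hyperVolume = that feature's variance.
fprintf('inference = %f, hyperVolume = %f\n', g.inference, g.hyperVolume);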

1009
ATL/NeuralNetwork.m Normal file

File diff suppressed because it is too large.

79
ATL/NeuralNetworkConstants.m Normal file

@@ -0,0 +1,79 @@
classdef NeuralNetworkConstants < handle
%% Rules for automatic creation of new layers
methods (Access = public)
function const = CREATE_LAYER_WITH_ONE_NODE(~)
const = 1;
end
function const = CREATE_LAYER_EQUAL_OUTPUT(~)
const = 2;
end
function const = CREATE_LAYER_BY_ARGUMENT(~)
const = 3;
end
function const = CREATE_MIRRORED_LAYER(~)
const = 16;
end
end
%% Rules for prune nodes in a layer
methods (Access = public)
function const = PRUNE_SINGLE_LEAST_CONTRIBUTION_NODES(~)
const = 14;
end
function const = PRUNE_MULTIPLE_NODES_WITH_CONTRIBUTION_BELOW_EXPECTED(~)
const = 15;
end
end
%% Activation functions
methods (Access = public)
function const = ACTIVATION_FUNCTION_SIGMOID(~)
const = 4;
end
function const = ACTIVATION_FUNCTION_TANH(~)
const = 5;
end
function const = ACTIVATION_FUNCTION_RELU(~)
const = 6;
end
function const = ACTIVATION_FUNCTION_LINEAR(~)
const = 7;
end
function const = ACTIVATION_FUNCTION_SOFTMAX(~)
const = 8;
end
end
%% Activation functions and Loss functions (normally used as output activation function)
methods (Access = public)
function const = ACTIVATION_LOSS_FUNCTION_SIGMOID_MSE(~)
const = 9;
end
function const = ACTIVATION_LOSS_FUNCTION_TANH(~)
const = 10;
end
function const = ACTIVATION_LOSS_FUNCTION_RELU(~)
const = 11;
end
function const = ACTIVATION_LOSS_FUNCTION_SOFTMAX_CROSS_ENTROPY(~)
const = 12;
end
function const = ACTIVATION_LOSS_FUNCTION_LINEAR_CROSS_ENTROPY(~)
const = 13;
end
end
end

86
ATL/Util.m Normal file

@@ -0,0 +1,86 @@
% Marcus Vinicius Sousa Leite de Carvalho
% marcus.decarvalho@ntu.edu.sg
%
% NANYANG TECHNOLOGICAL UNIVERSITY - NTUITIVE PTE LTD Dual License Agreement
% Non-Commercial Use Only
% This NTUITIVE License Agreement, including all exhibits ("NTUITIVE-LA") is a legal agreement between you and NTUITIVE (or we) located at 71 Nanyang Drive, NTU Innovation Centre, #01-109, Singapore 637722, a wholly owned subsidiary of Nanyang Technological University (NTU) for the software or data identified above, which may include source code, and any associated materials, text or speech files, associated media and "online" or electronic documentation and any updates we provide in our discretion (together, the "Software").
%
% By installing, copying, or otherwise using this Software, found at https://github.com/Ivsucram/ATL_Matlab, you agree to be bound by the terms of this NTUITIVE-LA. If you do not agree, do not install copy or use the Software. The Software is protected by copyright and other intellectual property laws and is licensed, not sold. If you wish to obtain a commercial royalty bearing license to this software please contact us at marcus.decarvalho@ntu.edu.sg.
%
% SCOPE OF RIGHTS:
% You may use, copy, reproduce, and distribute this Software for any non-commercial purpose, subject to the restrictions in this NTUITIVE-LA. Some purposes which can be non-commercial are teaching, academic research, public demonstrations and personal experimentation. You may also distribute this Software with books or other teaching materials, or publish the Software on websites, that are intended to teach the use of the Software for academic or other non-commercial purposes.
% You may not use or distribute this Software or any derivative works in any form for commercial purposes. Examples of commercial purposes would be running business operations, licensing, leasing, or selling the Software, distributing the Software for use with commercial products, using the Software in the creation or use of commercial products or any other activity which purpose is to procure a commercial gain to you or others.
% If the Software includes source code or data, you may create derivative works of such portions of the Software and distribute the modified Software for non-commercial purposes, as provided herein.
% If you distribute the Software or any derivative works of the Software, you will distribute them under the same terms and conditions as in this license, and you will not grant other rights to the Software or derivative works that are different from those provided by this NTUITIVE-LA.
% If you have created derivative works of the Software, and distribute such derivative works, you will cause the modified files to carry prominent notices so that recipients know that they are not receiving the original Software. Such notices must state: (i) that you have changed the Software; and (ii) the date of any changes.
%
% You may not distribute this Software or any derivative works.
% In return, we simply require that you agree:
% 1. That you will not remove any copyright or other notices from the Software.
% 2. That if any of the Software is in binary format, you will not attempt to modify such portions of the Software, or to reverse engineer or decompile them, except and only to the extent authorized by applicable law.
% 3. That NTUITIVE is granted back, without any restrictions or limitations, a non-exclusive, perpetual, irrevocable, royalty-free, assignable and sub-licensable license, to reproduce, publicly perform or display, install, use, modify, post, distribute, make and have made, sell and transfer your modifications to and/or derivative works of the Software source code or data, for any purpose.
% 4. That any feedback about the Software provided by you to us is voluntarily given, and NTUITIVE shall be free to use the feedback as it sees fit without obligation or restriction of any kind, even if the feedback is designated by you as confidential.
% 5. THAT THE SOFTWARE COMES "AS IS", WITH NO WARRANTIES. THIS MEANS NO EXPRESS, IMPLIED OR STATUTORY WARRANTY, INCLUDING WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, ANY WARRANTY AGAINST INTERFERENCE WITH YOUR ENJOYMENT OF THE SOFTWARE OR ANY WARRANTY OF TITLE OR NON-INFRINGEMENT. THERE IS NO WARRANTY THAT THIS SOFTWARE WILL FULFILL ANY OF YOUR PARTICULAR PURPOSES OR NEEDS. ALSO, YOU MUST PASS THIS DISCLAIMER ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.
% 6. THAT NEITHER NTUITIVE NOR NTU NOR ANY CONTRIBUTOR TO THE SOFTWARE WILL BE LIABLE FOR ANY DAMAGES RELATED TO THE SOFTWARE OR THIS NTUITIVE-LA, INCLUDING DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL OR INCIDENTAL DAMAGES, TO THE MAXIMUM EXTENT THE LAW PERMITS, NO MATTER WHAT LEGAL THEORY IT IS BASED ON. ALSO, YOU MUST PASS THIS LIMITATION OF LIABILITY ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.
% 7. That we have no duty of reasonable care or lack of negligence, and we are not obligated to (and will not) provide technical support for the Software.
% 8. That if you breach this NTUITIVE-LA or if you sue anyone over patents that you think may apply to or read on the Software or anyone's use of the Software, this NTUITIVE-LA (and your license and rights obtained herein) terminate automatically. Upon any such termination, you shall destroy all of your copies of the Software immediately. Sections 3, 4, 5, 6, 7, 8, 11 and 12 of this NTUITIVE-LA shall survive any termination of this NTUITIVE-LA.
% 9. That the patent rights, if any, granted to you in this NTUITIVE-LA only apply to the Software, not to any derivative works you make.
% 10. That the Software may be subject to U.S. export jurisdiction at the time it is licensed to you, and it may be subject to additional export or import laws in other places. You agree to comply with all such laws and regulations that may apply to the Software after delivery of the software to you.
% 11. That all rights not expressly granted to you in this NTUITIVE-LA are reserved.
% 12. That this NTUITIVE-LA shall be construed and controlled by the laws of the Republic of Singapore without regard to conflicts of law. If any provision of this NTUITIVE-LA shall be deemed unenforceable or contrary to law, the rest of this NTUITIVE-LA shall remain in full effect and interpreted in an enforceable manner that most nearly captures the intent of the original language.
%
% Copyright (c) NTUITIVE. All rights reserved.
classdef Util
methods
function [mean, var, std] = recursiveMeanStd(~, x, oldMean, oldVar, n)
% Recursive (online) update of the mean, variance accumulator, and standard
% deviation given a new sample x, the previous estimates, and the sample count n.
% var accumulates the sum of squared deviations (Welford-style), so std = sqrt(var/n).
% Reference: http://www.scalaformachinelearning.com/2015/10/recursive-mean-and-standard-deviation.html
%mean = (1 - 1/n)*oldMean + (oldMean./n);
mean = oldMean + (x - oldMean)./n;
var = oldVar + (x - oldMean) .*(x - mean);
std = sqrt(var/n);
end
function p = probit(~, mean, standardDeviation)
%probit approximation of the expected sigmoid under a Gaussian:
% scales the mean by 1/sqrt(1 + pi*sigma^2/8), i.e. uses xi^2 = pi/8
p = (1 + pi .* (standardDeviation .^ 2) ./ 8);
p = mean ./ sqrt(p);
end
function dist=KLDiv(~, P,Q)
% dist = KLDiv(P,Q) Kullback-Leibler divergence of two discrete probability
% distributions
% P and Q are automatically normalised so that each row sums to one
% P = n x nbins
% Q = 1 x nbins or n x nbins(one to one)
% dist = n x 1
if size(P,2)~=size(Q,2)
error('the number of columns in P and Q should be the same');
end
if sum(~isfinite(P(:))) + sum(~isfinite(Q(:)))
error('the inputs contain non-finite values!')
end
% normalizing the P and Q
if size(Q,1)==1
Q = Q ./sum(Q);
P = P ./repmat(sum(P,2),[1 size(P,2)]);
temp = P.*log(P./repmat(Q,[size(P,1) 1]));
temp(isnan(temp))=0;% resolving the case when P(i)==0
dist = sum(temp,2);
elseif size(Q,1)==size(P,1)
Q = Q ./repmat(sum(Q,2),[1 size(Q,2)]);
P = P ./repmat(sum(P,2),[1 size(P,2)]);
temp = P.*log(P./Q);
temp(isnan(temp))=0; % resolving the case when P(i)==0
dist = sum(temp,2);
end
end
end
end

87
ATL/prepare_datasets.py Normal file
View File

@ -0,0 +1,87 @@
import sys
import os
sys.path.insert(0, '../../')
import ACDCDataManipulator as acdc
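# Build the paired source/target CSV files used by the experiments: for each
# pairing below, generate_source_and_target() produces <filename>_source.csv and
# <filename>_target.csv via ACDCDataManipulator, skipping files that already exist.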
def generate_source(dataset_name, filename):
if not os.path.isfile(filename + '_source.csv'):
acdc.generate_csv_from_dataset(dataset_name, 5, True, False, 1)
os.rename('source.csv', filename + '_source.csv')
def generate_target(dataset_name, filename):
if not os.path.isfile(filename + '_target.csv'):
acdc.generate_csv_from_dataset(dataset_name, 7, False, False, 1)
os.rename('target.csv', filename + '_target.csv')
def generate_source_and_target(source_dataset, target_dataset, filename):
generate_source(source_dataset, filename)
generate_target(target_dataset, filename)
filename = 'usps_mnist'
generate_source_and_target('usps-16','mnist-16',filename)
filename = 'mnist_usps'
generate_source_and_target('mnist-28','usps-28',filename)
filename = 'amazon_review_beauty_luxury'
generate_source_and_target('amazon-review-all-beauty','amazon-review-luxury-beauty',filename)
filename = 'amazon_review_beauty_magazine'
generate_source_and_target('amazon-review-all-beauty','amazon-review-magazine-subscription',filename)
filename = 'amazon_review_beauty_books'
generate_source_and_target('amazon-review-all-beauty','amazon-review-books',filename)
filename = 'amazon_review_beauty_industrial'
generate_source_and_target('amazon-review-all-beauty','amazon-review-industrial-scientific',filename)
filename = 'amazon_review_luxury_beauty'
generate_source_and_target('amazon-review-luxury-beauty','amazon-review-all-beauty',filename)
filename = 'amazon_review_luxury_magazine'
generate_source_and_target('amazon-review-luxury-beauty','amazon-review-magazine-subscription',filename)
filename = 'amazon_review_luxury_books'
generate_source_and_target('amazon-review-luxury-beauty','amazon-review-books',filename)
filename = 'amazon_review_luxury_industrial'
generate_source_and_target('amazon-review-luxury-beauty','amazon-review-industrial-scientific',filename)
filename = 'amazon_review_books_beauty'
generate_source_and_target('amazon-review-books','amazon-review-all-beauty',filename)
filename = 'amazon_review_books_luxury'
generate_source_and_target('amazon-review-books','amazon-review-luxury-beauty',filename)
filename = 'amazon_review_books_magazine'
generate_source_and_target('amazon-review-books','amazon-review-magazine-subscription',filename)
filename = 'amazon_review_books_industrial'
generate_source_and_target('amazon-review-books','amazon-review-industrial-scientific',filename)
filename = 'amazon_review_industrial_beauty'
generate_source_and_target('amazon-review-industrial-scientific','amazon-review-all-beauty',filename)
filename = 'amazon_review_industrial_luxury'
generate_source_and_target('amazon-review-industrial-scientific','amazon-review-luxury-beauty',filename)
filename = 'amazon_review_industrial_magazine'
generate_source_and_target('amazon-review-industrial-scientific','amazon-review-magazine-subscription',filename)
filename = 'amazon_review_industrial_books'
generate_source_and_target('amazon-review-industrial-scientific','amazon-review-books',filename)
filename = 'amazon_review_magazine_beauty'
generate_source_and_target('amazon-review-magazine-subscription','amazon-review-all-beauty',filename)
filename = 'amazon_review_magazine_luxury'
generate_source_and_target('amazon-review-magazine-subscription','amazon-review-luxury-beauty',filename)
filename = 'amazon_review_magazine_industrial'
generate_source_and_target('amazon-review-magazine-subscription','amazon-review-industrial-scientific',filename)
filename = 'amazon_review_magazine_books'
generate_source_and_target('amazon-review-magazine-subscription','amazon-review-books',filename)

Binary file not shown.

BIN
ATL/results_run_1.rar Normal file

Binary file not shown.

Binary file not shown.

BIN
ATL/results_run_2.rar Normal file

Binary file not shown.

Binary file not shown.

BIN
ATL/results_run_3.rar Normal file

Binary file not shown.

Binary file not shown.

BIN
ATL/results_run_4.rar Normal file

Binary file not shown.

Binary file not shown.

BIN
ATL/results_run_5.rar Normal file

Binary file not shown.

81
FUSION/README.md Normal file
View File

@ -0,0 +1,81 @@
# FUSION
Efficient Multistream Classification using Direct DensIty Ratio Estimation
## Synopsis
Traditional data stream classification assumes that data is generated by a single non-stationary process. In contrast, the multistream classification problem involves two independent non-stationary data-generating processes. One of them is the source stream, which continuously generates labeled data. The other is the target stream, which generates unlabeled test data from the same domain. The distribution represented by the source stream data is biased compared to that of the target stream. Moreover, these streams may have asynchronous concept drifts between them. The multistream classification problem is to predict the class labels of target stream instances while utilizing labeled data available on the source stream. This kind of scenario is often observed in real-world applications due to the scarcity of labeled data. FUSION provides an efficient solution for multistream classification by fusing drift detection into online data adaptation. Theoretical analysis and experimental results show its effectiveness. Please refer to the paper (mentioned in the reference section) for further details.
## Requirements
FUSION requires that:
* The input file is provided in ARFF/CSV format.
* All features are numeric. Non-numeric features can be converted to numeric features using standard techniques (e.g., one-hot encoding).
* Features should be normalized for better performance.
## Environment
* Python 2.7
* SciPy, scikit-learn
* NumPy, math
## Execution
To execute the program:
1. First set properties in the config.properties file. Available options have been discussed later in this file.
2. Call the main function in the multistream.py file with two parameters (see the sketch below). The first parameter is the path to the dataset file without extension; the extension is automatically appended from the corresponding property in the config.properties file. The second parameter is the probability that the next instance comes from the source stream. For example, a second parameter of 0.1 means that the next instance comes from the source stream with 10% probability and from the target stream with 90% probability.
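A minimal invocation sketch (hypothetical file and dataset names; it assumes `multistream.py` exposes `main(dataset_path, source_probability)` as described above):

```python
# run_fusion_example.py -- hypothetical driver script
import multistream

# Path prefix without extension; srcfileAppend / trgfileAppend from
# config.properties are appended to locate <prefix>_source.csv / <prefix>_target.csv.
dataset_path = 'usps_mnist'

# Probability that the next instance is drawn from the source stream (10% here).
source_probability = 0.1

multistream.main(dataset_path, source_probability)
```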
## Properties:
* baseDir
* Path to the base directory, which contains the input file(s). This will be appended to the name of the input file for getting the input file path.
* srcfileAppend
* This string is appended to the name of the input file supplied as the first parameter to get the file name for the source stream in the baseDir location.
* trgfileAppend
* This string is appended to the name of the input file supplied as the first parameter to get the file name for the target stream in the baseDir location.
* useKliepCVSigma
* 1: Use the cross-validated value for sigma; 0: Use a fixed value for sigma.
* kliepDefSigma
* In case useKliepCVSigma=0 was used, the value for sigma is specified in this property.
* kliepParEta
* Value for the parameter Eta.
* kliepParLambda
* Value for the parameter lambda.
* kliepParB
* Value for the parameter B.
* kliepParThreshold
* Value for the threshold used in the change detection algorithm.
* useSvmCVParams
* If set, find the parameters for SVM using cross-validation.
* svmDefGamma
* Default value for the gamma parameter in SVM.
* svmDefC
* Default value for the parameter "C" in SVM.
* kernel
* Type of kernel used in the SVM algorithm.
* cushion
* The value of cushion for the change detection algorithm if not calculated by gamma.
* sensitivity
* Sensitivity of the change detection algorithm.
* maxWindowSize
* Size of the source and target sliding window.
* initialDataSize
* Size of the initial/warm-up training data.
* enableForceUpdate
* If set, update the classifier after a long period of time even if there is no change detected.
* forceUpdatePeriod
* If enableForceUpdate is set, the classifier is updated after this many instances even if there is no change detected.
* ensemble_size
* Size of the ensemble.
* output_file_name
* Path to the output file.
* logfile
* Path to the log file.
* tempDir
* Path to the directory containing all the temporary files.
## Output
### Console output
* The program shows progress and any detected change points in the console.
* At the end, it reports the overall accuracy.
### File output
1. A log file is generated in the location specified by "logfile" property, which contains important debug information.
2. The output file contains the running average accuracy.
## Reference
[FUSION: An Online Method for Multistream Classification](https://dl.acm.org/citation.cfm?id=3132886&dl=ACM&coll=DL&CFID=1020200191&CFTOKEN=12773057)

BIN
FUSION/Results_run_1.rar Normal file

Binary file not shown.

BIN
FUSION/Results_run_2.rar Normal file

Binary file not shown.

BIN
FUSION/Results_run_3.rar Normal file

Binary file not shown.

BIN
FUSION/Results_run_4.rar Normal file

Binary file not shown.

BIN
FUSION/Results_run_5.rar Normal file

Binary file not shown.

150
FUSION/changedetection.py Normal file
View File

@ -0,0 +1,150 @@
from properties import Properties
import math, numpy as np
from scipy.stats import beta, binom
from decimal import Decimal
import sys, random, time
class ChangeDetection(object):
def __init__(self, gamma, sensitivity, maxWindowSize):
self.gamma = gamma
self.sensitivity = sensitivity
self.maxWindowSize = maxWindowSize
"""
Functions to estimate beta distribution parameters
"""
def __calcBetaDistAlpha(self, list, sampleMean, sampleVar):
if sampleMean == -1:
sampleMean = np.mean(list)
if sampleVar == -1:
sampleVar = np.var(list)
c = (sampleMean * (1-sampleMean)/sampleVar) - 1
return sampleMean * c
def __calcBetaDistBeta(self, list, alphaChange, sampleMean):
if sampleMean == -1:
sampleMean = np.mean(list)
return alphaChange * ((1.0/sampleMean) - 1)
"""
input: The dynamic sliding window containing confidence of target classifier
output: -1 if no change found, otherwise the change point
"""
def detectTargetChange(self, slidingWindow):
estimatedChangePoint = -1
N = len(slidingWindow)
cushion = max(Properties.CUSHION, int(math.floor(N ** self.gamma)))
#If the window exceeds its maximum size, or the mean confidence falls below Properties.CONFCUTOFF, force a retrain by returning change point 0
if N > self.maxWindowSize:
Properties.logger.info('Current target Window Size is: ' + str(N) + ', which exceeds max limit, so update classifier')
return 0
if N > 2*cushion and np.mean(slidingWindow[0:N]) <= Properties.CONFCUTOFF:
Properties.logger.info('Current target Window Size is: ' + str(N))
Properties.logger.info('But overall confidence fell below ' + str(Properties.CONFCUTOFF) + ', so update classifier')
return 0
threshold = -math.log(self.sensitivity)
w = 0.0
kAtMaxW = -1
kindex = np.arange(cushion, N - cushion + 1)
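# Scan each candidate change point k: fit beta distributions (method of moments) to the
# confidences before and after k, accumulate the log-likelihood ratio over the post-change
# window, and report the best k if its statistic exceeds -log(sensitivity).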
for k in kindex:
xbar0 = np.mean(slidingWindow[:k])
var0 = np.var(slidingWindow[:k])
xbar1 = np.mean(slidingWindow[k:])
var1 = np.var(slidingWindow[k:])
if xbar1 <= 0.9*xbar0:
skn = 0.0
alphaPreChange = self.__calcBetaDistAlpha(slidingWindow[:k], xbar0, var0)
betaPreChange = self.__calcBetaDistBeta(slidingWindow[:k], alphaPreChange, xbar0)
alphaPostChange = self.__calcBetaDistAlpha(slidingWindow[k:], xbar1, var1)
betaPostChange = self.__calcBetaDistBeta(slidingWindow[k:], alphaPostChange, xbar1)
try:
swin = map(float, slidingWindow[k:])
denom = [beta.pdf(s, alphaPreChange, betaPreChange) for s in swin]
numer = [beta.pdf(s, alphaPostChange, betaPostChange) for s in swin]
nor_denom = np.array([1e-50 if (h-0)<1e-50 else h for h in denom])
l_ratios = numer/nor_denom
l_ratios_no_zeros = np.array([1e-50 if (h-0)<1e-50 else h for h in l_ratios])
ll_ratios = np.log(l_ratios_no_zeros)
skn = sum(ll_ratios)
except:
e = sys.exc_info()
print str(e[1])
raise Exception('Error in calculating skn')
if skn > w:
w = skn
kAtMaxW = k
if w >= threshold and kAtMaxW != -1:
estimatedChangePoint = kAtMaxW
Properties.logger.info('Estimated change point is ' + str(estimatedChangePoint) + ', detected at ' + str(N))
return estimatedChangePoint
"""
input: The dynamic sliding window containing accuracy of source classifier
output: -1 if no change found, otherwise the change point
"""
def detectSourceChange(self, slidingWindow):
estimatedChangePoint = -1
N = len(slidingWindow)
cushion = max(Properties.CUSHION, int(math.floor(N ** self.gamma)))
#If the window exceeds its maximum size, or the mean accuracy falls below Properties.CONFCUTOFF, force a retrain by returning change point 0
if N > self.maxWindowSize:
Properties.logger.info('Current source Window Size is: ' + str(N) + ', which exceeds max limit, so update classifier')
return 0
if N > 2*cushion and np.mean(slidingWindow) <= Properties.CONFCUTOFF:
Properties.logger.info('Current source Window Size is: ' + str(N))
Properties.logger.info('But overall confidence fell below ' + str(Properties.CONFCUTOFF) + ', so update classifier')
return 0
threshold = -math.log(self.sensitivity)
w = 0.0
kAtMaxW = -1
kindex = np.arange(cushion, N - cushion + 1)
for k in kindex:
xbar0 = np.mean(slidingWindow[:k])
xbar1 = np.mean(slidingWindow[k:])
# means should set 1=accurate, 0=erroneous
if xbar1 <= 0.9*xbar0:
skn = 0.0
try:
swin = map(float, slidingWindow[k:])
denom = [binom.pmf(s, k, xbar0) for s in swin]
numer = [binom.pmf(s, N-k, xbar1) for s in swin]
nor_denom = np.array([1e-50 if (h - 0) < 1e-50 else h for h in denom])
l_ratios = numer/nor_denom
l_ratios_no_zeros = np.array([1e-50 if (h-0)<1e-50 else h for h in l_ratios])
ll_ratios = np.log(l_ratios_no_zeros)
skn = sum(ll_ratios)
except:
e = sys.exc_info()
print str(e[1])
raise Exception('Error in calculating skn')
if skn > w:
w = skn
kAtMaxW = k
if w >= threshold and kAtMaxW != -1:
estimatedChangePoint = kAtMaxW
Properties.logger.info('Estimated change point is ' + str(estimatedChangePoint) + ', detected at: ' + str(N))
Properties.logger.info('Value of w: ' + str(w) + ', Value of Threshold: ' + str(threshold))
return estimatedChangePoint

26
FUSION/config.properties Normal file
View File

@ -0,0 +1,26 @@
baseDir=
srcfileAppend=_source.csv
trgfileAppend=_target.csv
useKliepCVSigma=1
kliepDefSigma=0.01
kliepParEta=1
kliepParLambda=0.01
kliepParB=800
kliepParThreshold=5
useSvmCVParams=0
svmDefGamma=0.0001
svmDefC=131072
gamma=0.5
kernel=rbf
cushion=100
sensitivity=0.0001
maxWindowSize=800
initialDataSize=10
enableForceUpdate=1
forceUpdatePeriod=3000
ensemble_size=1
confthreshold=0.9
confcutoff=0.5
output_file_name=result.out
logfile=multistream.log
tempDir=temp/

3
FUSION/debug.log Normal file
View File

@ -0,0 +1,3 @@
[1216/110931.086:ERROR:directory_reader_win.cc(43)] FindFirstFile: The system cannot find the path specified. (0x3)
[1217/110930.796:ERROR:directory_reader_win.cc(43)] FindFirstFile: The system cannot find the path specified. (0x3)
[1218/110930.411:ERROR:directory_reader_win.cc(43)] FindFirstFile: The system cannot find the path specified. (0x3)

239
FUSION/ensemble.py Normal file
View File

@ -0,0 +1,239 @@
import math
from model import Model
from properties import Properties
class Ensemble(object):
def __init__(self, ensemble_size):
self.models = []
self.size = ensemble_size
"""
Update weights for all models in the ensemble.
"""
def updateWeight(self, data, isSource):
for m in self.models:
m.computeModelWeight(data, isSource, Properties.MAXVAR)
def reEvalModelWeights(self, data, maxvar):
for i in range(0, len(self.models)):
self.models[i].weight = self.models[i].computeModelWeightKLIEP(data, maxvar)
"""
Adding a new model to the Ensemble.
Returns the index of the Ensemble array where the model is added.
"""
def __addModelKLIEP(self, model, data, maxvar):
index = 0
self.reEvalModelWeights(data, maxvar)
if len(self.models) < self.size:
self.models.append(model)
index = len(self.models)-1
else:
#replace least desirable model
index = self.__getLeastDesirableModelKLIEP()
if self.models[index].weight < model.weight:
Properties.logger.info('Least desirable model removed at ' + str(index))
self.models[index] = model
else:
Properties.logger.info('New model was not added as its weight is less than all of the existing models')
return -1
return index
"""
Adding a new model to the Ensemble.
Returns the index of the Ensemble array where the model is added.
"""
def __addModel(self, model):
index = 0
if len(self.models) < self.size:
self.models.append(model)
index = len(self.models)-1
else:
#replace least desirable model
index = self.__getLeastDesirableModel()
Properties.logger.info('Least desirable model removed at ' + str(index))
self.models[index] = model
return index
"""
Compute the least desirable model to be replaced when the ensemble size has reached its limit.
Least desirable is one having least target weight
Returns the array index of the least desired model.
"""
def __getLeastDesirableModelKLIEP(self):
weights = {}
for i in xrange(len(self.models)):
weights[i] = self.models[i].weight
keys = sorted(weights, key=weights.get)
return keys[0]
"""
Compute the least desirable model to be replaced when the ensemble size has reached its limit.
Least desirable is one having least target weight, but not the largest source weight.
Returns the array index of the least desired model.
"""
def __getLeastDesirableModel(self):
sweights = {}
tweights = {}
for i in xrange(len(self.models)):
sweights[i] = self.models[i].sweight
tweights[i] = self.models[i].tweight
skeys = sorted(sweights, reverse=True, key=sweights.get)
tkeys = sorted(tweights, key=tweights.get)
# skeys = sweights.keys()
# tkeys = tweights.keys()
for i in xrange(len(skeys)):
if tkeys[i] == skeys[i]:
continue
else:
return tkeys[i]
return tkeys[0]
"""
Initiate the creation of appropriate model in the ensemble for given target data.
Also compute weights for the new model based on the current data.
"""
def generateNewModelKLIEP(self, srcData, srcLabels, trgData, weightSrcData, svmC, svmGamma, svmKernel):
model = Model()
if len(srcData) == 0 or len(trgData) == 0:
raise Exception('Source or Target stream should have some elements')
#Create new model
Properties.logger.info('Target model creation')
model.trainUsingKLIEPWeights(srcData, srcLabels, weightSrcData, Properties.MAXVAR, svmC, svmGamma, svmKernel)
#compute source and target weight
Properties.logger.info('Computing model weights')
model.weight = model.computeModelWeightKLIEP(trgData, Properties.MAXVAR)
#update ensemble
index = self.__addModelKLIEP(model, trgData, Properties.MAXVAR)
if index != -1:
Properties.logger.info('Ensemble updated at ' + str(index))
"""
Initiate the creation of appropriate model in the ensemble for given source or target data.
Also compute weights for the new model based on the current data.
"""
def generateNewModel(self, sourceData, targetData, isSource, useSvmCVParams, svmDefC, svmDefGamma):
model = Model()
if len(sourceData) == 0 or len(targetData) == 0:
raise Exception('Source or Target stream should have some elements')
#Create new model
if isSource:
Properties.logger.info('Source model creation')
model.train(sourceData, None, Properties.MAXVAR, useSvmCVParams, svmDefC, svmDefGamma)
else:
Properties.logger.info('Target model creation')
model.train(sourceData, targetData, Properties.MAXVAR, useSvmCVParams, svmDefC, svmDefGamma)
#compute source and target weight
Properties.logger.info('Computing model weights')
model.computeModelWeight(sourceData, True, Properties.MAXVAR)
model.computeModelWeight(targetData, False, Properties.MAXVAR)
#update ensemble
index = self.__addModel(model)
Properties.logger.info('Ensemble updated at ' + str(index))
"""
Get prediction for a given data instance from each model.
For source data: the ensemble prediction is 1 if the class with the maximum weighted vote matches the true class label, else 0.
For target data: the ensemble predicts the class with the maximum weighted vote, together with the averaged confidence measure.
"""
def evaluateEnsembleKLIEP(self, dataInstance):
confSum = {}
weightSum = {}
for m in self.models:
# test data instance in each model
predictedClass, confidence = m.test(dataInstance)
# gather result
if predictedClass[0] in confSum:
confSum[predictedClass[0]] += confidence[0]
weightSum[predictedClass[0]] += m.weight
else:
confSum[predictedClass[0]] = confidence[0]
weightSum[predictedClass[0]] = m.weight
# get maximum voted class label
classMax = max(confSum, key=confSum.get)
return [classMax, confSum[classMax]/len(self.models)]
"""
Get prediction for a given data instance from each model.
For source data: the ensemble prediction is 1 if the class with the maximum weighted vote matches the true class label, else 0.
For target data: the ensemble predicts the class with the maximum weighted vote, together with the averaged confidence measure.
"""
def evaluateEnsemble(self, dataInstance, isSource):
classSum = {}
for m in self.models:
#test data instance in each model
result = m.test([dataInstance], Properties.MAXVAR)
#gather result
if isSource:
if int(result[0][0]) in classSum:
classSum[int(result[0][0])] += m.sweight
else:
classSum[int(result[0][0])] = m.sweight
else:
if int(result[0][0]) in classSum:
classSum[int(result[0][0])] += result[0][1]
else:
classSum[int(result[0][0])] = result[0][1]
#get maximum voted sum class label
classMax = 0.0
sumMax = max(classSum.values())
for i in classSum:
if classSum[i] == sumMax:
classMax = i
if isSource:
#for source data, check true vs predicted class label
if classMax == dataInstance[-1]:
return [1, -1]
else:
return [0, -1]
else:
# for target data
return [classMax, sumMax/len(self.models)]
"""
Get summary of models in ensemble.
"""
def getEnsembleSummary(self):
summry = '************************* E N S E M B L E S U M M A R Y ************************\n'
summry += 'Ensemble has currently ' + str(len(self.models)) + ' models.\n'
for i in xrange(len(self.models)):
summry += 'Model' + str(i+1) + ': weights<' + str(self.models[i].weight) + '>\n'
return summry

BIN
FUSION/ensemble.pyc Normal file

Binary file not shown.

100
FUSION/environment.yml Normal file
View File

@ -0,0 +1,100 @@
name: FUSION
channels:
- defaults
dependencies:
- attrs=19.3.0=py_0
- backports=1.0=py_2
- backports.shutil_get_terminal_size=1.0.0=py27_2
- backports.shutil_which=3.5.2=py27_0
- backports_abc=0.5=py27h0ec6b72_0
- blas=1.0=mkl
- bleach=3.1.0=py27_0
- ca-certificates=2020.12.8=haa95532_0
- certifi=2020.6.20=pyhd3eb1b0_3
- colorama=0.4.3=py_0
- configparser=4.0.2=py27_0
- decorator=4.4.1=py_0
- defusedxml=0.6.0=py_0
- entrypoints=0.3=py27_0
- enum34=1.1.6=py27_1
- functools32=3.2.3.2=py27_1
- futures=3.3.0=py27_0
- icc_rt=2019.0.0=h0cc432a_1
- icu=58.2=h2aa20d9_1
- intel-openmp=2019.4=245
- ipaddress=1.0.23=py_0
- ipykernel=4.10.0=py27_0
- ipython=5.8.0=py27_0
- ipython_genutils=0.2.0=py27_0
- ipywidgets=7.5.1=py_0
- jinja2=2.10.3=py_0
- jpeg=9b=ha175dff_2
- jsonschema=3.0.2=py27_0
- jupyter=1.0.0=py27_7
- jupyter_client=5.3.4=py27_0
- jupyter_console=5.2.0=py27_1
- jupyter_core=4.6.1=py27_0
- libpng=1.6.37=h7a46e7a_0
- libsodium=1.0.16=h8b3e59e_0
- m2w64-gcc-libgfortran=5.3.0=6
- m2w64-gcc-libs=5.3.0=7
- m2w64-gcc-libs-core=5.3.0=7
- m2w64-gmp=6.1.0=2
- m2w64-libwinpthread-git=5.0.0.4634.697f757=2
- markupsafe=1.1.1=py27h0c8e037_0
- mistune=0.8.4=py27h0c8e037_0
- mkl=2019.4=245
- mkl-service=2.3.0=py27h0b88c2a_0
- mkl_fft=1.0.15=py27h44c1dab_0
- msys2-conda-epoch=20160418=1
- nbconvert=5.6.1=py27_0
- nbformat=4.4.0=py27_0
- notebook=5.7.8=py27_0
- numpy=1.16.5=py27h5fc8d92_0
- numpy-base=1.16.5=py27hb1d0314_0
- openssl=1.0.2u=h0c8e037_0
- pandas=0.24.2=py27hc56fc5f_0
- pandoc=2.2.3.2=0
- pandocfilters=1.4.2=py27_1
- pathlib2=2.3.5=py27_0
- pickleshare=0.7.5=py27_0
- pip=19.3.1=py27_0
- prometheus_client=0.7.1=py_0
- prompt_toolkit=1.0.15=py27h3a8ec6a_0
- pygments=2.5.2=py_0
- pyqt=5.6.0=py27h6e61f57_6
- pyrsistent=0.15.6=py27h0c8e037_0
- python=2.7.17=h930f6bb_0
- python-dateutil=2.8.1=py_0
- pytz=2020.4=pyhd3eb1b0_0
- pywin32=227=py27h0c8e037_0
- pywinpty=0.5.5=py27_1000
- pyzmq=18.1.0=py27hc56fc5f_0
- qt=5.6.2=vc9hc26998b_12
- qtconsole=4.6.0=py_1
- scandir=1.10.0=py27h0c8e037_0
- scikit-learn=0.20.3=py27hf381715_0
- scipy=1.2.1=py27h4c3ab11_0
- send2trash=1.5.0=py27_0
- setuptools=44.0.0=py27_0
- simplegeneric=0.8.1=py27_2
- singledispatch=3.4.0.3=py27h3f9d112_0
- sip=4.18.1=py27hc56fc5f_2
- six=1.13.0=py27_0
- sqlite=3.30.1=h0c8e037_0
- terminado=0.8.3=py27_0
- testpath=0.4.4=py_0
- tornado=5.1.1=py27h0c8e037_0
- traitlets=4.3.3=py27_0
- vc=9=h7299396_1
- vs2008_runtime=9.00.30729.1=hfaea7d5_1
- wcwidth=0.1.7=py27_0
- webencodings=0.5.1=py27_1
- wheel=0.33.6=py27_0
- widgetsnbextension=3.5.1=py27_0
- win_unicode_console=0.5=py27hc037021_0
- wincertstore=0.2=py27hf04cefb_0
- winpty=0.4.3=4
- zeromq=4.3.1=h2880e7c_3
- zlib=1.2.11=h3cc03e0_3
prefix: C:\Users\ivsuc\Miniconda3\envs\FUSION

10
FUSION/gaussianModel.py Normal file
View File

@ -0,0 +1,10 @@
class GaussianModel:
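"""Container for a KLIEP density-ratio model: alphah holds the learned kernel weights and refPoints holds the Gaussian kernel centres."""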
def __init__(self, alphah=None, refPoints=None):
self.alphah = alphah
self.refPoints = refPoints
def setAlpha(self, alphah):
self.alphah = alphah
def setRefPoints(self, refPoints):
self.refPoints = refPoints

BIN
FUSION/gaussianModel.pyc Normal file

Binary file not shown.

500
FUSION/grid.py Normal file
View File

@ -0,0 +1,500 @@
#!/usr/bin/env python
__all__ = ['find_parameters']
import os, sys, traceback, getpass, time, re
from threading import Thread
from subprocess import *
if sys.version_info[0] < 3:
from Queue import Queue
else:
from queue import Queue
telnet_workers = []
ssh_workers = []
nr_local_worker = 1
class GridOption:
def __init__(self, dataset_pathname, options):
dirname = os.path.dirname(__file__)
if sys.platform != 'win32':
self.svmtrain_pathname = os.path.join(dirname, 'libsvm-weights-3.20/svm-train')
self.gnuplot_pathname = '/usr/bin/gnuplot'
else:
# example for windows
self.svmtrain_pathname = os.path.join(dirname, r'libsvm-weights-3.20\windows\svm-train.exe')
# svmtrain_pathname = r'c:\Program Files\libsvm\windows\svm-train.exe'
self.gnuplot_pathname = r'c:\tmp\gnuplot\binary\pgnuplot.exe'
self.fold = 5
self.c_begin, self.c_end, self.c_step = -5, 15, 2
self.g_begin, self.g_end, self.g_step = 3, -15, -2
self.grid_with_c, self.grid_with_g = True, True
self.dataset_pathname = dataset_pathname
self.dataset_title = os.path.split(dataset_pathname)[1]
self.out_pathname = '{0}.out'.format(self.dataset_title)
self.png_pathname = '{0}.png'.format(self.dataset_title)
self.pass_through_string = ' '
self.resume_pathname = None
self.parse_options(options)
def parse_options(self, options):
if type(options) == str:
options = options.split()
i = 0
pass_through_options = []
while i < len(options):
if options[i] == '-log2c':
i = i + 1
if options[i] == 'null':
self.grid_with_c = False
else:
self.c_begin, self.c_end, self.c_step = map(float,options[i].split(','))
elif options[i] == '-log2g':
i = i + 1
if options[i] == 'null':
self.grid_with_g = False
else:
self.g_begin, self.g_end, self.g_step = map(float,options[i].split(','))
elif options[i] == '-v':
i = i + 1
self.fold = options[i]
elif options[i] in ('-c','-g'):
raise ValueError('Use -log2c and -log2g.')
elif options[i] == '-svmtrain':
i = i + 1
self.svmtrain_pathname = options[i]
elif options[i] == '-gnuplot':
i = i + 1
if options[i] == 'null':
self.gnuplot_pathname = None
else:
self.gnuplot_pathname = options[i]
elif options[i] == '-out':
i = i + 1
if options[i] == 'null':
self.out_pathname = None
else:
self.out_pathname = options[i]
elif options[i] == '-png':
i = i + 1
self.png_pathname = options[i]
elif options[i] == '-resume':
if i == (len(options)-1) or options[i+1].startswith('-'):
self.resume_pathname = self.dataset_title + '.out'
else:
i = i + 1
self.resume_pathname = options[i]
else:
pass_through_options.append(options[i])
i = i + 1
self.pass_through_string = ' '.join(pass_through_options)
if not os.path.exists(self.svmtrain_pathname):
raise IOError('svm-train executable not found')
if not os.path.exists(self.dataset_pathname):
raise IOError('dataset not found')
if self.resume_pathname and not os.path.exists(self.resume_pathname):
raise IOError('file for resumption not found')
if not self.grid_with_c and not self.grid_with_g:
raise ValueError('-log2c and -log2g should not be null simultaneously')
if self.gnuplot_pathname and not os.path.exists(self.gnuplot_pathname):
sys.stderr.write('gnuplot executable not found\n')
self.gnuplot_pathname = None
def redraw(db,best_param,gnuplot,options,tofile=False):
if len(db) == 0: return
begin_level = round(max(x[2] for x in db)) - 3
step_size = 0.5
best_log2c,best_log2g,best_rate = best_param
# if newly obtained c, g, or cv values are the same,
# then stop redrawing the contour.
if all(x[0] == db[0][0] for x in db): return
if all(x[1] == db[0][1] for x in db): return
if all(x[2] == db[0][2] for x in db): return
if tofile:
gnuplot.write(b"set term png transparent small linewidth 2 medium enhanced\n")
gnuplot.write("set output \"{0}\"\n".format(options.png_pathname.replace('\\','\\\\')).encode())
#gnuplot.write(b"set term postscript color solid\n")
#gnuplot.write("set output \"{0}.ps\"\n".format(options.dataset_title).encode().encode())
elif sys.platform == 'win32':
gnuplot.write(b"set term windows\n")
else:
gnuplot.write( b"set term x11\n")
gnuplot.write(b"set xlabel \"log2(C)\"\n")
gnuplot.write(b"set ylabel \"log2(gamma)\"\n")
gnuplot.write("set xrange [{0}:{1}]\n".format(options.c_begin,options.c_end).encode())
gnuplot.write("set yrange [{0}:{1}]\n".format(options.g_begin,options.g_end).encode())
gnuplot.write(b"set contour\n")
gnuplot.write("set cntrparam levels incremental {0},{1},100\n".format(begin_level,step_size).encode())
gnuplot.write(b"unset surface\n")
gnuplot.write(b"unset ztics\n")
gnuplot.write(b"set view 0,0\n")
gnuplot.write("set title \"{0}\"\n".format(options.dataset_title).encode())
gnuplot.write(b"unset label\n")
gnuplot.write("set label \"Best log2(C) = {0} log2(gamma) = {1} accuracy = {2}%\" \
at screen 0.5,0.85 center\n". \
format(best_log2c, best_log2g, best_rate).encode())
gnuplot.write("set label \"C = {0} gamma = {1}\""
" at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g).encode())
gnuplot.write(b"set key at screen 0.9,0.9\n")
gnuplot.write(b"splot \"-\" with lines\n")
db.sort(key = lambda x:(x[0], -x[1]))
prevc = db[0][0]
for line in db:
if prevc != line[0]:
gnuplot.write(b"\n")
prevc = line[0]
gnuplot.write("{0[0]} {0[1]} {0[2]}\n".format(line).encode())
gnuplot.write(b"e\n")
gnuplot.write(b"\n") # force gnuplot back to prompt when term set failure
gnuplot.flush()
def calculate_jobs(options):
def range_f(begin,end,step):
# like range, but works on non-integer too
seq = []
while True:
if step > 0 and begin > end: break
if step < 0 and begin < end: break
seq.append(begin)
begin = begin + step
return seq
def permute_sequence(seq):
n = len(seq)
if n <= 1: return seq
mid = int(n/2)
left = permute_sequence(seq[:mid])
right = permute_sequence(seq[mid+1:])
ret = [seq[mid]]
while left or right:
if left: ret.append(left.pop(0))
if right: ret.append(right.pop(0))
return ret
c_seq = permute_sequence(range_f(options.c_begin,options.c_end,options.c_step))
g_seq = permute_sequence(range_f(options.g_begin,options.g_end,options.g_step))
if not options.grid_with_c:
c_seq = [None]
if not options.grid_with_g:
g_seq = [None]
nr_c = float(len(c_seq))
nr_g = float(len(g_seq))
i, j = 0, 0
jobs = []
while i < nr_c or j < nr_g:
if i/nr_c < j/nr_g:
# increase C resolution
line = []
for k in range(0,j):
line.append((c_seq[i],g_seq[k]))
i = i + 1
jobs.append(line)
else:
# increase g resolution
line = []
for k in range(0,i):
line.append((c_seq[k],g_seq[j]))
j = j + 1
jobs.append(line)
resumed_jobs = {}
if options.resume_pathname is None:
return jobs, resumed_jobs
for line in open(options.resume_pathname, 'r'):
line = line.strip()
rst = re.findall(r'rate=([0-9.]+)',line)
if not rst:
continue
rate = float(rst[0])
c, g = None, None
rst = re.findall(r'log2c=([0-9.-]+)',line)
if rst:
c = float(rst[0])
rst = re.findall(r'log2g=([0-9.-]+)',line)
if rst:
g = float(rst[0])
resumed_jobs[(c,g)] = rate
return jobs, resumed_jobs
class WorkerStopToken: # used to notify the worker to stop or if a worker is dead
pass
class Worker(Thread):
def __init__(self,name,job_queue,result_queue,options):
Thread.__init__(self)
self.name = name
self.job_queue = job_queue
self.result_queue = result_queue
self.options = options
def run(self):
while True:
(cexp,gexp) = self.job_queue.get()
if cexp is WorkerStopToken:
self.job_queue.put((cexp,gexp))
# print('worker {0} stop.'.format(self.name))
break
try:
c, g = None, None
if cexp != None:
c = 2.0**cexp
if gexp != None:
g = 2.0**gexp
rate = self.run_one(c,g)
if rate is None: raise RuntimeError('get no rate')
except:
# we failed, let others do that and we just quit
traceback.print_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])
self.job_queue.put((cexp,gexp))
sys.stderr.write('worker {0} quit.\n'.format(self.name))
break
else:
self.result_queue.put((self.name,cexp,gexp,rate))
def get_cmd(self,c,g):
options=self.options
cmdline = '"' + options.svmtrain_pathname + '"'
if options.grid_with_c:
cmdline += ' -c {0} '.format(c)
if options.grid_with_g:
cmdline += ' -g {0} '.format(g)
cmdline += ' -v {0} {1} {2} '.format\
(options.fold,options.pass_through_string,options.dataset_pathname)
return cmdline
class LocalWorker(Worker):
def run_one(self,c,g):
cmdline = self.get_cmd(c,g)
result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout
for line in result.readlines():
if str(line).find('Cross') != -1:
return float(line.split()[-1][0:-1])
class SSHWorker(Worker):
def __init__(self,name,job_queue,result_queue,host,options):
Worker.__init__(self,name,job_queue,result_queue,options)
self.host = host
self.cwd = os.getcwd()
def run_one(self,c,g):
cmdline = 'ssh -x -t -t {0} "cd {1}; {2}"'.format\
(self.host,self.cwd,self.get_cmd(c,g))
result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout
for line in result.readlines():
if str(line).find('Cross') != -1:
return float(line.split()[-1][0:-1])
class TelnetWorker(Worker):
def __init__(self,name,job_queue,result_queue,host,username,password,options):
Worker.__init__(self,name,job_queue,result_queue,options)
self.host = host
self.username = username
self.password = password
def run(self):
import telnetlib
self.tn = tn = telnetlib.Telnet(self.host)
tn.read_until('login: ')
tn.write(self.username + '\n')
tn.read_until('Password: ')
tn.write(self.password + '\n')
# XXX: how to know whether login is successful?
tn.read_until(self.username)
#
print('login ok', self.host)
tn.write('cd '+os.getcwd()+'\n')
Worker.run(self)
tn.write('exit\n')
def run_one(self,c,g):
cmdline = self.get_cmd(c,g)
result = self.tn.write(cmdline+'\n')
(idx,matchm,output) = self.tn.expect(['Cross.*\n'])
for line in output.split('\n'):
if str(line).find('Cross') != -1:
return float(line.split()[-1][0:-1])
def find_parameters(dataset_pathname, options=''):
def update_param(c,g,rate,best_c,best_g,best_rate,worker,resumed):
if (rate > best_rate) or (rate==best_rate and g==best_g and c<best_c):
best_rate,best_c,best_g = rate,c,g
stdout_str = '[{0}] {1} {2} (best '.format\
(worker,' '.join(str(x) for x in [c,g] if x is not None),rate)
output_str = ''
if c != None:
stdout_str += 'c={0}, '.format(2.0**best_c)
output_str += 'log2c={0} '.format(c)
if g != None:
stdout_str += 'g={0}, '.format(2.0**best_g)
output_str += 'log2g={0} '.format(g)
stdout_str += 'rate={0})'.format(best_rate)
print(stdout_str)
if options.out_pathname and not resumed:
output_str += 'rate={0}\n'.format(rate)
result_file.write(output_str)
result_file.flush()
return best_c,best_g,best_rate
options = GridOption(dataset_pathname, options);
if options.gnuplot_pathname:
gnuplot = Popen(options.gnuplot_pathname,stdin = PIPE,stdout=PIPE,stderr=PIPE).stdin
else:
gnuplot = None
# put jobs in queue
jobs,resumed_jobs = calculate_jobs(options)
job_queue = Queue(0)
result_queue = Queue(0)
for (c,g) in resumed_jobs:
result_queue.put(('resumed',c,g,resumed_jobs[(c,g)]))
for line in jobs:
for (c,g) in line:
if (c,g) not in resumed_jobs:
job_queue.put((c,g))
# hack the queue to become a stack --
# this is important when some thread
# failed and re-put a job. If we still
# use FIFO, the job will be put
# into the end of the queue, and the graph
# will only be updated in the end
job_queue._put = job_queue.queue.appendleft
# fire telnet workers
if telnet_workers:
nr_telnet_worker = len(telnet_workers)
username = getpass.getuser()
password = getpass.getpass()
for host in telnet_workers:
worker = TelnetWorker(host,job_queue,result_queue,
host,username,password,options)
worker.start()
# fire ssh workers
if ssh_workers:
for host in ssh_workers:
worker = SSHWorker(host,job_queue,result_queue,host,options)
worker.start()
# fire local workers
for i in range(nr_local_worker):
worker = LocalWorker('local',job_queue,result_queue,options)
worker.start()
# gather results
done_jobs = {}
if options.out_pathname:
if options.resume_pathname:
result_file = open(options.out_pathname, 'a')
else:
result_file = open(options.out_pathname, 'w')
db = []
best_rate = -1
best_c,best_g = None,None
for (c,g) in resumed_jobs:
rate = resumed_jobs[(c,g)]
best_c,best_g,best_rate = update_param(c,g,rate,best_c,best_g,best_rate,'resumed',True)
for line in jobs:
for (c,g) in line:
while (c,g) not in done_jobs:
(worker,c1,g1,rate1) = result_queue.get()
done_jobs[(c1,g1)] = rate1
if (c1,g1) not in resumed_jobs:
best_c,best_g,best_rate = update_param(c1,g1,rate1,best_c,best_g,best_rate,worker,False)
db.append((c,g,done_jobs[(c,g)]))
if gnuplot and options.grid_with_c and options.grid_with_g:
redraw(db,[best_c, best_g, best_rate],gnuplot,options)
redraw(db,[best_c, best_g, best_rate],gnuplot,options,True)
if options.out_pathname:
result_file.close()
job_queue.put((WorkerStopToken,None))
best_param, best_cg = {}, []
if best_c != None:
best_param['c'] = 2.0**best_c
best_cg += [2.0**best_c]
if best_g != None:
best_param['g'] = 2.0**best_g
best_cg += [2.0**best_g]
print('{0} {1}'.format(' '.join(map(str,best_cg)), best_rate))
return best_rate, best_param
if __name__ == '__main__':
def exit_with_help():
print("""\
Usage: grid.py [grid_options] [svm_options] dataset
grid_options :
-log2c {begin,end,step | "null"} : set the range of c (default -5,15,2)
begin,end,step -- c_range = 2^{begin,...,begin+k*step,...,end}
"null" -- do not grid with c
-log2g {begin,end,step | "null"} : set the range of g (default 3,-15,-2)
begin,end,step -- g_range = 2^{begin,...,begin+k*step,...,end}
"null" -- do not grid with g
-v n : n-fold cross validation (default 5)
-svmtrain pathname : set svm executable path and name
-gnuplot {pathname | "null"} :
pathname -- set gnuplot executable path and name
"null" -- do not plot
-out {pathname | "null"} : (default dataset.out)
pathname -- set output file path and name
"null" -- do not output file
-png pathname : set graphic output file path and name (default dataset.png)
-resume [pathname] : resume the grid task using an existing output file (default pathname is dataset.out)
This is experimental. Try this option only if some parameters have been checked for the SAME data.
svm_options : additional options for svm-train""")
sys.exit(1)
if len(sys.argv) < 2:
exit_with_help()
dataset_pathname = sys.argv[-1]
options = sys.argv[1:-1]
try:
find_parameters(dataset_pathname, options)
except (IOError,ValueError) as e:
sys.stderr.write(str(e) + '\n')
sys.stderr.write('Try "grid.py" for more information.\n')
sys.exit(1)

318
FUSION/kliep.py Normal file
View File

@ -0,0 +1,318 @@
from __future__ import division
import numpy as np
import math as m
class Kliep(object):
def __init__(self, kliepParEta, kliepParLambda, kliepParB, kliepParThreshold, kliepDefSigma=0.01):
self.kliepDefSigma = kliepDefSigma
self.kliepParEta = kliepParEta
self.kliepParLambda = kliepParLambda
self.kliepParB = kliepParB
self.kliepParThreshold = kliepParThreshold
def pdf_gaussian(self, x, mu):
x_size = np.shape(x)
d = x_size[0]
nx = x_size[len(x_size) - 1]
tmp = (x - np.tile(mu, (1, nx)))/(np.sqrt(2) * np.tile(self.kliepDefSigma, (1, nx)))
denom = m.pow((2 * m.pi), (-1 / 2))
px = np.exp(-np.power(tmp, 2, dtype='float64'))*(denom / self.kliepDefSigma)
return px
def kernel_Gaussian(self, x, c):
x_size = np.shape(x)
dx = x_size[0]
nx = x_size[len(x_size) - 1]
c_size = np.shape(c)
dc = c_size[0]
nc = c_size[len(c_size) - 1]
x2 = np.power(x, 2, dtype=np.float)
c2 = np.power(c, 2, dtype=np.float)
# if the array is 1D, need to add an axis first before doing transpose.
distance2 = np.tile(c2, (nx, 1)) + np.tile(x2.T, (1, nc)) - (2 * x.T * c)
X = np.exp(-distance2 / (2 * m.pow(self.kliepDefSigma, 2)), dtype='float64')
return X
"""
- find kernel gaussians for multi dim x
- in x and c, each column represents one data point
"""
def kernel_Gaussian_mdim(self, x, c):
x_size = np.shape(x)
dx = x_size[0]
nx = x_size[len(x_size) - 1]
c_size = np.shape(c)
dc = c_size[0]
nc = c_size[len(c_size) - 1]
distance2 = None
for i in range(0, nx):
# though we extract a column, it becomes a row matrix in Python.
x_col_i = x[:, i]
dist_x_col_i_c = self.distance(x_col_i[np.newaxis].T, dx, 1, c, dc, nc)
if distance2 is None:
# since X will have more rows, so while copying need to add a new axis also.
distance2 = dist_x_col_i_c[np.newaxis]
else:
distance2 = np.append(distance2, dist_x_col_i_c[np.newaxis], axis=0)
X = np.exp(-distance2 / (2 * m.pow(self.kliepDefSigma, 2)), dtype='float64')
return X
def kernel_Gaussian_mdim_choose_sigma(self, x, c, sigma):
x_size = np.shape(x)
dx = x_size[0]
nx = x_size[len(x_size) - 1]
c_size = np.shape(c)
dc = c_size[0]
nc = c_size[len(c_size) - 1]
distance2 = None
for i in range(0, nx):
# though we extract a column, it becomes a row matrix in Python.
x_col_i = x[:, i]
dist_x_col_i_c = self.distance(x_col_i[np.newaxis].T, dx, 1, c, dc, nc)
if distance2 is None:
# since X will have more rows, so while copying need to add a new axis also.
distance2 = dist_x_col_i_c[np.newaxis]
else:
distance2 = np.append(distance2, dist_x_col_i_c[np.newaxis], axis=0)
X = np.exp(-distance2/(2*m.pow(sigma, 2)), dtype='float64')
return X
"""
x_col_i represents ith instance in row matrix format.
c represents the selected test points. ith column represents ith selected test point.
distance returns a row matrix, dimension 1*c_ncol, where (1,i) element is the distance between the instance
represented by x_col_i and ith instance in c, i.e., ith column in c
"""
def distance(self, x_col_i, x_col_i_nrow, x_col_i_ncol, c, c_nrow, c_ncol):
dist_tmp = np.power(np.tile(x_col_i, (1, c_ncol)) - c, 2, dtype='float64')
# need to do column-wise sum
dist_2 = np.sum(dist_tmp, axis=0, dtype='float64')
return dist_2
def KLIEP_projection(self, alpha, Xte, meanDistSrcData, c):
# b_alpha = np.sum(b*alpha)
b_alpha = np.dot(meanDistSrcData.T, alpha)
alpha = alpha + meanDistSrcData * (1 - b_alpha) * np.linalg.pinv(c, rcond=1e-20)
# alpha = np.max(0,alpha[np.newaxis])
alpha[alpha < 0] = 0
b_alpha_new = np.dot(meanDistSrcData.T, alpha)
alpha = alpha * np.linalg.pinv(b_alpha_new, rcond=1e-20)
Xte_alpha = np.dot(Xte, alpha)
Xte_alpha[(Xte_alpha-0)<0.00001] = 0.00001
#Xte_alpha_no_zeros = np.array([(1/100) if (h - 0) < 0.00001 else h for h in Xte_alpha])
log_xte_alpha = np.log(Xte_alpha, dtype='float64')
score = np.mean(log_xte_alpha, dtype='float64')
return alpha, Xte_alpha, score
def KLIEP_projection_wo_score(self, alpha, meanDistSrcData, c):
b_alpha = np.dot(meanDistSrcData.T, alpha)
alpha = alpha + meanDistSrcData * (1 - b_alpha) * np.linalg.pinv(c, rcond=1e-20)
# alpha = np.max(0,alpha[np.newaxis])
alpha[alpha < 0] = 0
b_alpha_new = np.dot(meanDistSrcData.T, alpha)
alpha = alpha * np.linalg.pinv(b_alpha_new, rcond=1e-20)
return alpha
def KLIEP_learning(self, mean_X_de, X_nu):
X_nu_size = np.shape(X_nu)
n_nu = X_nu_size[0]
nc = X_nu_size[len(X_nu_size) - 1]
max_iteration = 100
epsilon_list = np.power(10, range(3, -4, -1), dtype='float64')
# c = sum(np.power(mean_X_de, 2, dtype=np.float))
c = np.dot(mean_X_de.T, mean_X_de)
alpha = np.ones((nc, 1))
[alpha, X_nu_alpha, score] = self.KLIEP_projection(alpha, X_nu, mean_X_de, c)
for epsilon in epsilon_list:
for iteration in range(1, max_iteration):
alpha_tmp = alpha + (epsilon * np.dot(X_nu.T, (1 / X_nu_alpha)))
[alpha_new, X_nu_alpha_new, score_new] = self.KLIEP_projection(alpha_tmp, X_nu, mean_X_de, c)
if (score_new - score) <= 0:
break
score = score_new
alpha = alpha_new
X_nu_alpha = X_nu_alpha_new
return alpha
def KLIEP(self, srcData, trgData):
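"""
Fit the KLIEP importance model w(x) = sum_l alphah[l] * K_sigma(x, refPoints[:, l]),
which approximates the density ratio p_target(x) / p_source(x): the alphas maximize
the mean log importance over the target samples subject to the constraint that the
mean importance over the source samples equals one. Reference points are taken from
the target data. Returns alphah, the source and target kernel matrices, and refPoints.
"""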
srcDataSize = np.shape(srcData)
nRowSrcData = srcDataSize[0]
nColSrcData = srcDataSize[len(srcDataSize) - 1]
trgDataSize = np.shape(trgData)
nRowTrgData = trgDataSize[0]
nColTrgData = trgDataSize[len(trgDataSize) - 1]
b = min(self.kliepParB, nColTrgData)
#rand_index = np.random.permutation(nColTrgData)
# rand_index = genfromtxt('rand_index.csv', delimiter=',')-1
#refPoints = trgData[:, rand_index[0:b].tolist()]
refPoints = trgData[:, -b:]
######### Computing the final solution wh_x_de
kernelMatSrcData = self.kernel_Gaussian_mdim(srcData, refPoints)
kernelMatTrgData = self.kernel_Gaussian_mdim(trgData, refPoints)
meanDistSrcData = np.transpose(np.mean(kernelMatSrcData, 0)[np.newaxis])
alphah = self.KLIEP_learning(meanDistSrcData, kernelMatTrgData)
# wh_x_nu = np.transpose(np.dot(X_nu, alphah))
#weightTrgData = np.dot(kernelMatTrgData, alphah)
return alphah, kernelMatSrcData, kernelMatTrgData, refPoints
def chooseSigma(self, srcData, trgData, fold=5):
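"""
Select the Gaussian kernel width by likelihood cross-validation: starting from
sigma = 10, repeatedly shrink it and keep the value that maximizes the held-out
mean log importance over `fold` splits of the target data.
"""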
srcDataSize = np.shape(srcData)
nRowSrcData = srcDataSize[0]
nColSrcData = srcDataSize[len(srcDataSize) - 1]
trgDataSize = np.shape(trgData)
nRowTrgData = trgDataSize[0]
nColTrgData = trgDataSize[len(trgDataSize) - 1]
print "Choose Sigma"
####### Choosing Gaussian kernel center `x_ce'
# rand_index = np.random.permutation(n_nu)
b = min(self.kliepParB, nColTrgData)
# undo after finishing debug
# x_ce = np.array(x_nu)
# np.random.shuffle(x_ce)
rand_index = np.random.permutation(nColTrgData)
# rand_index = genfromtxt('rand_index.csv', delimiter=',')-1
refPoints = trgData[:, rand_index[0:b].tolist()]
####### Searching Gaussian kernel width `sigma_chosen'
sigma = 10
score = -float("inf")
epsilon_list = range(int(m.log10(sigma)) - 1, -2, -1)
for epsilon in epsilon_list:
for iteration in range(1, 10, 1):
sigma_new = sigma - m.pow(10, epsilon)
print "sigma = ", sigma, " epsilon=", epsilon, "sigma_new=", sigma_new
# undo after finishing debug
cv_index = np.random.permutation(nColTrgData)
# cv_index = genfromtxt('cv_index' + str(epsilon) + '_' + str(iteration) + '.csv', delimiter=',')-1
cv_split = np.floor(np.divide(np.multiply(range(0, nColTrgData), fold), nColTrgData)) + 1
score_new = 0
kernelMatSrcData = self.kernel_Gaussian_mdim_choose_sigma(srcData, refPoints, sigma_new)
kernelMatTrgData = self.kernel_Gaussian_mdim_choose_sigma(trgData, refPoints, sigma_new)
# axis = 0 means column-wise mean
meanDistSrcData = np.transpose(np.mean(kernelMatSrcData, axis=0)[np.newaxis])
for i in range(1, fold + 1, 1):
alpha_cv = self.KLIEP_learning(meanDistSrcData, kernelMatTrgData[cv_index[cv_split != i].tolist(), :])
wh_cv = np.dot(kernelMatTrgData[cv_index[cv_split == i].tolist(), :], alpha_cv)
score_new = score_new + (np.mean(np.log(wh_cv), dtype=np.float)/fold)
if (score_new - score) <= 0:
break
score = score_new
sigma = sigma_new
print "score=", score, " sigma=", sigma, "epsilon=", epsilon, "iteration=", iteration
print "Sigma = ", str(sigma)
return sigma
def changeDetection(self, trgData, refPointsOld, alphahOld, refPointsNew, alphahNew, kernelMatTrgDataNew=None):
if len(np.shape(trgData)) == 1:
trgData = trgData[np.newaxis]
if kernelMatTrgDataNew is None:
kernelMatTrgDataNew = self.kernel_Gaussian_mdim(trgData, refPointsNew)
kernelMatTrgDataOld = self.kernel_Gaussian_mdim(trgData, refPointsOld)
weightTrgDataNew = self.calcInstanceWeights(kernelMatTrgDataNew, alphahNew)
weightTrgDataNew[(weightTrgDataNew - 0) < 0.00001] = 0.00001
#weightTrgDataNew_no_zeros = np.array([float(0.0001) if (h-0)<0.00001 else h for h in weightTrgDataNew[0]])
weightTrgDataOld = self.calcInstanceWeights(kernelMatTrgDataOld, alphahOld)
weightTrgDataOld[(weightTrgDataOld - 0) < 0.00001] = 0.00001
#weightTrgDataOld_no_zeros = np.array([float(0.0001) if (h - 0) < 0.00001 else h for h in weightTrgDataOld[0]])
l_ratios = weightTrgDataNew/weightTrgDataOld
lnWeightTrgData = np.log(l_ratios, dtype='float64')
changeScore = np.sum(lnWeightTrgData, dtype='float64')
#print "ChangeScore=", changeScore
return changeScore > self.kliepParThreshold, changeScore, kernelMatTrgDataNew
"""
updateAlpha parameters:
srcData - contains instances from src stream
trgData - contains instances from trg stream, including the new point
newTrgPoint - is the new point, last column of trgData should match with newTrgPoint
alphah - most recent set of alpha
"""
def updateAlpha(self, srcData, trgData, newTrgPoint, refPoints, alphah, kernelMatSrcData=None):
if len(np.shape(srcData)) == 1:
srcData = srcData[np.newaxis]
if len(np.shape(trgData)) == 1:
trgData = trgData[np.newaxis]
# calculate c
trgDataSize = np.shape(trgData)
nRowTrgData = trgDataSize[0]
nColTrgData = trgDataSize[len(trgDataSize) - 1]
if newTrgPoint.ndim == 1:
newTrgPoint = newTrgPoint[np.newaxis]
kernelNewTrgPoint = self.kernel_Gaussian_mdim(newTrgPoint, refPoints)
# alphah is a column vector, each row of kernel_x_new represents distances for one data point
c = np.dot(kernelNewTrgPoint, alphah)
# update alpha values
tmp = 1 - (self.kliepParEta * self.kliepParLambda)
alphah = alphah * tmp
alphah = alphah[1:, :]
alphah = np.append(alphah, self.kliepParEta/c, axis=0)
alphah, kernelMatSrcData = self.satConstraints(srcData, trgData, refPoints, alphah, kernelMatSrcData)
return alphah, kernelMatSrcData
def satConstraints(self, srcData, trgData, refPoints, alphah, kernelMatSrcData=None):
trgDataSize = np.shape(trgData)
nRowTrgData = trgDataSize[0]
nColTrgData = trgDataSize[len(trgDataSize) - 1]
if kernelMatSrcData is None:
kernelMatSrcData = self.kernel_Gaussian_mdim(srcData, refPoints)
meanDistSrcData = self.colWiseMeanTransposed(kernelMatSrcData)
# c = sum(np.power(mean_X_de, 2, dtype=np.float))
c = np.dot(meanDistSrcData.T, meanDistSrcData)
alphah = self.KLIEP_projection_wo_score(alphah, meanDistSrcData, c)
return alphah, kernelMatSrcData
"""
returns transpose of matrix resulting from taking column wise mean of mat.
"""
def colWiseMeanTransposed(self, mat):
return np.transpose(np.mean(mat, 0)[np.newaxis])
"""
Returns instance weights as a row vector
"""
def calcInstanceWeights(self, kernelMat, alphah):
return np.dot(kernelMat, alphah).T

BIN
FUSION/kliep.pyc Normal file

Binary file not shown.

133
FUSION/kmm.py Normal file
View File

@ -0,0 +1,133 @@
import math, numpy, sklearn.metrics.pairwise as sk, sys
from sklearn import linear_model
from cvxopt import matrix, solvers
#DENSITY ESTIMATION
#KMM solving the quadratic programming problem to get betas (weights) for each training instance
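#The QP minimizes the maximum mean discrepancy (MMD) between the reweighted training sample and the test sample in the RBF kernel feature space, subject to box constraints on the betas and the constraint that their sum stays within n_tr*(1 +/- eps)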
def kmm(Xtrain, Xtest, sigma):
n_tr = len(Xtrain)
n_te = len(Xtest)
#calculate Kernel
print 'Computing kernel for training data ...'
K_ns = sk.rbf_kernel(Xtrain, Xtrain, sigma)
#make it symmetric
K = 0.5*(K_ns + K_ns.transpose())
#calculate kappa
print 'Computing kernel for kappa ...'
kappa_r = sk.rbf_kernel(Xtrain, Xtest, sigma)
ones = numpy.ones(shape=(n_te, 1))
kappa = numpy.dot(kappa_r, ones)
kappa = -(float(n_tr)/float(n_te)) * kappa
#calculate eps
eps = (math.sqrt(n_tr) - 1)/math.sqrt(n_tr)
#constraints
A0 = numpy.ones(shape=(1,n_tr))
A1 = -numpy.ones(shape=(1,n_tr))
A = numpy.vstack([A0, A1, -numpy.eye(n_tr), numpy.eye(n_tr)])
b = numpy.array([[n_tr*(eps+1), n_tr*(eps-1)]])
b = numpy.vstack([b.T, -numpy.zeros(shape=(n_tr,1)), numpy.ones(shape=(n_tr,1))*1000])
print 'Solving quadratic program for beta ...'
P = matrix(K, tc='d')
q = matrix(kappa, tc='d')
G = matrix(A, tc='d')
h = matrix(b, tc='d')
beta = solvers.qp(P,q,G,h)
return [i for i in beta['x']]
#KMM PARAMETER TUNING
#Train a linear regression model with Lasso (L1 regularization).
#Model parameter selection via cross validation
#Predict the target (Beta) for a given test dataset
def regression(XTrain, betaTrain, XTest):
model = linear_model.LassoCV(cv=10, alphas=[0.001,0.005,0.01,0.05,0.1,0.5,1,5,10])
model.fit(XTrain, betaTrain)
Beta = model.predict(XTest)
return [i for i in Beta]
#KMM PARAMETER TUNING
#Compute J score for parameter tuning of KMM
def computeJ(betaTrain, betaTest):
tr = sum([i ** 2 for i in betaTrain])
te = sum(betaTest)
return ((1/float(len(betaTrain)))*tr) - ((2/float(len(betaTest)))*te)
#I/O OPERATIONS
#Read input csv file
def getData(filename):
data = []
with open(filename) as f:
content = f.readlines()
for line in content:
line = line.strip()
data.append(map(float,line.split(",")))
return data
#I/O OPERATIONS
#Write Output to file
def writeFile(filename, data):
if len(data) == 0:
return
with open(filename, 'w') as f:
for i in data:
f.write(str(i) + '\n')
#MAIN ALGORITHM
#compute beta
def getBeta(traindata, testdata, gammab):
Jmin = 0
beta = []
for g in gammab:
betaTrain = kmm(traindata, testdata, g)
betaTest = regression(traindata, betaTrain, testdata)
J = computeJ(betaTrain,betaTest)
#print betaTrain
#print betaTest
#print J
if len(beta) == 0:
Jmin = J
beta = list(betaTrain)
elif Jmin > J:
Jmin = J
beta = list(betaTrain)
return beta
#MAIN METHOD
def main():
#traindata = [[1,2,3],[4,7,4],[3,3,3],[4,4,4],[5,5,5],[3,4,5],[1,2,3],[4,7,4],[3,3,3],[4,4,4],[5,5,5],[3,4,5],[1,2,3],[4,7,4],[3,3,3],[4,4,4],[5,5,5],[3,4,5],[1,2,3],[4,7,4],[3,3,3],[4,4,4],[5,5,5],[3,4,5]]
#testdata = [[5,9,10],[4,5,6],[10,20,30],[1,2,3],[3,4,5],[5,6,7],[7,8,9],[100,100,100],[11,22,33],[12,11,5],[5,9,10],[4,5,6],[10,20,30],[1,2,3],[3,4,5],[5,6,7],[7,8,9],[100,100,100],[11,22,33],[12,11,5]]
#gammab = [0.001]
if len(sys.argv) != 4:
print 'Incorrect number of arguments.'
print 'Arg: training_file, test_file, output_file.'
return
traindata = getData(sys.argv[1])
testdata = getData(sys.argv[2])
gammab = [1/float(len(traindata)),0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.5,1,5,10]
print 'Got training and test data.'
beta = getBeta(traindata, testdata, gammab)
writeFile(sys.argv[3], beta)
if __name__ == '__main__':
main()

343
FUSION/manager.py Normal file
View File

@ -0,0 +1,343 @@
from __future__ import print_function
from properties import Properties
from kliep import Kliep
from ensemble import Ensemble
from stream import Stream
from sklearn import svm#, grid_search
import time, sys, datetime
import numpy as np
import random, math
import gaussianModel as gm
#from py4j.java_gateway import JavaGateway, GatewayParameters, CallbackServerParameters
class Manager(object):
def __init__(self, sourceFile, targetFile):
self.SDataBufferArr = None #2D array representation of self.SDataBuffer
self.SDataLabels = None
self.TDataBufferArr = None #2D array representation of self.TDataBuffer
self.TDataLabels = None
self.useKliepCVSigma = Properties.useKliepCVSigma
self.kliep = None
self.useSvmCVParams = Properties.useSvmCVParams
self.ensemble = Ensemble(Properties.ENSEMBLE_SIZE)
self.initialWindowSize = int(Properties.INITIAL_DATA_SIZE)
self.maxWindowSize = int(Properties.MAX_WINDOW_SIZE)
self.enableForceUpdate = int(Properties.enableForceUpdate)
self.forceUpdatePeriod = int(Properties.forceUpdatePeriod)
"""
- simulate source and target streams from corresponding files.
"""
print("Reading the Source Dataset")
self.source = Stream(sourceFile, Properties.INITIAL_DATA_SIZE)
print("Reading the Target Dataset")
self.target = Stream(targetFile, Properties.INITIAL_DATA_SIZE)
print("Finished Reading the Target Dataset")
Properties.MAXVAR = self.source.initialData.shape[0]
"""
Detect drift on a given data stream.
Returns the change point index on the stream array.
"""
def __detectDrift(self, slidingWindow, flagStream):
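#NOTE: self.changeDetector is not initialized in __init__ above and this method is not
#called from startFusion; it appears to be a leftover from an earlier version.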
changePoint = -1
if flagStream == 0:
changePoint = self.changeDetector.detectSourceChange(slidingWindow)
elif flagStream == 1:
changePoint = self.changeDetector.detectTargetChange(slidingWindow)
else:
raise Exception('flagStream var has value ' + str(flagStream) + ' that is not supported.')
return changePoint
"""
Write value (accuracy or confidence) to a file with DatasetName as an identifier.
"""
def __saveResult(self, acc, datasetName):
with open(datasetName + '_' + Properties.OUTFILENAME, 'a') as f:
f.write(str(acc) + "\n")
def convListOfDictToNDArray(self, listOfDict):
arrayRep = []
if not listOfDict:
return arrayRep
arrayRep = np.array([[float(v)] for k,v in listOfDict[0].items() if k!=-1])
for i in range(1, len(listOfDict)):
arrayRep = np.append(arrayRep, np.array([[float(v)] for k,v in listOfDict[i].items() if k!=-1]), axis=1)
return arrayRep
def collectLabels(self, listOfDict):
labels = []
for d in listOfDict:
labels.append(str(d[-1]))
return labels
"""
The main method handling multistream classification using KLIEP.
"""
def startFusion(self, datasetName, probFromSource):
#save the timestamp
globalStartTime = time.time()
Properties.logger.info('Global Start Time: ' + datetime.datetime.fromtimestamp(globalStartTime).strftime('%Y-%m-%d %H:%M:%S'))
#open files for saving accuracy and confidence
fAcc = open(datasetName + '_' + Properties.OUTFILENAME, 'w')
fConf = open(datasetName + '_confidence' + '_' + Properties.OUTFILENAME, 'w')
#initialize gaussian models
gmOld = gm.GaussianModel()
gmUpdated = gm.GaussianModel()
#variable to track forceupdate period
idxLastUpdate = 0
#Get data buffer
self.SDataBufferArr = self.source.initialData
self.SDataLabels = self.source.initialDataLabels
self.TDataBufferArr = self.target.initialData
#first choose a suitable value for sigma
self.kliep = Kliep(Properties.kliepParEta, Properties.kliepParLambda, Properties.kliepParB, Properties.kliepParThreshold, Properties.kliepDefSigma)
#self.kliep = Kliep(Properties.kliepParEta, Properties.kliepParLambda, Properties.kliepParB, Properties.MAXVAR*Properties.kliepParThreshold, Properties.kliepDefSigma)
if self.useKliepCVSigma==1:
self.kliep.kliepDefSigma = self.kliep.chooseSigma(self.SDataBufferArr, self.TDataBufferArr)
#calculate alpha values
#self.kliep.kliepDefSigma = 0.1
Properties.logger.info('Estimating initial DRM')
gmOld.alphah, kernelMatSrcData, kernelMatTrgData, gmOld.refPoints = self.kliep.KLIEP(self.SDataBufferArr, self.TDataBufferArr)
#initialize the updated gaussian model
gmUpdated.setAlpha(gmOld.alphah)
gmUpdated.setRefPoints(gmOld.refPoints)
#now resize the windows appropriately
self.SDataBufferArr = self.SDataBufferArr[:, -Properties.MAX_WINDOW_SIZE:]
self.SDataLabels = self.SDataLabels[-Properties.MAX_WINDOW_SIZE:]
self.TDataBufferArr = self.TDataBufferArr[:, -Properties.MAX_WINDOW_SIZE:]
kernelMatSrcData = kernelMatSrcData[-Properties.MAX_WINDOW_SIZE:,:]
kernelMatTrgData = kernelMatTrgData[-Properties.MAX_WINDOW_SIZE:,:]
#meanDistSrcData = self.kliep.colWiseMeanTransposed(kernelMatSrcData)
Properties.logger.info('Initializing Ensemble with the first model')
#target model
#first calculate weight for source instances
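#KLIEP density-ratio estimate: w(x) = sum_l alphah_l * K_sigma(x, refPoint_l), presumably
#evaluated here via the precomputed kernel matrix of source points against the reference points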
weightSrcData = self.kliep.calcInstanceWeights(kernelMatSrcData, gmUpdated.alphah)
#since weightSrcData is a column matrix, convert it to a list before sending to generating new model
SDataBufferArrTransposed = self.SDataBufferArr.T
TDataBufferArrTransposed = self.TDataBufferArr.T
if self.useSvmCVParams == 1:
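#NOTE: grid_search is commented out in the sklearn import above, so this branch would raise a
#NameError as written; in recent scikit-learn the equivalent is sklearn.model_selection.GridSearchCV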
params = {'gamma': [2 ** 2, 2 ** -16], 'C': [2 ** -6, 2 ** 15]}
svr = svm.SVC()
opt = grid_search.GridSearchCV(svr, params)
opt.fit(SDataBufferArrTransposed.tolist(), self.SDataLabels)
optParams = opt.best_params_
self.ensemble.generateNewModelKLIEP(SDataBufferArrTransposed, self.SDataLabels,
TDataBufferArrTransposed, weightSrcData[0].tolist(),
optParams['C'], optParams['gamma'])
else:
self.ensemble.generateNewModelKLIEP(SDataBufferArrTransposed.tolist(), self.SDataLabels,
TDataBufferArrTransposed.tolist(), weightSrcData[0].tolist(),
Properties.svmDefC, Properties.svmDefGamma, Properties.svmKernel)
Properties.logger.info(self.ensemble.getEnsembleSummary())
sDataIndex = 0
tDataIndex = 0
trueTargetNum = 0
trueSourceNum = 0
targetConfSum = 0
#enoughInstToUpdate is used to see if there are enough instances in the windows to
#estimate the weights
Properties.logger.info('Starting MultiStream Classification with FUSION')
#while self.target.data.shape[1] > tDataIndex:
while len(self.source.data.T) + len(self.target.data.T) > sDataIndex + tDataIndex:
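#draw the next instance from the source stream with probability equal to its share of the
#instances still unread, so that on average each stream is consumed in proportion to what remains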
ratio = (len(self.source.data.T) - sDataIndex) / (len(self.source.data.T) + len(self.target.data.T) - (sDataIndex + tDataIndex) + 0.0)
"""
If the source stream is not empty, do proper sampling. Otherwise, just take
the new instance from the target stream.
"""
# if self.source.data.shape[1] > sDataIndex:
# fromSource = random.uniform(0,1)<probFromSource
# else:
# print("\nsource stream sampling not possible")
# fromSource = False
if (np.random.rand() <= ratio and sDataIndex < len(self.source.data.T)) or (tDataIndex >= len(self.target.data.T)):
fromSource = True
elif tDataIndex < len(self.target.data.T):
fromSource = False
if fromSource:
print('S', end="")
#print("Source data index: ", sDataIndex)
#print("\nlen(self.SDataBufferList) = ", len(self.SDataBufferList), ": source window slides")
#remove the first instance, and add the new instance in the buffers
newSrcDataArr = self.source.data[:, sDataIndex][np.newaxis].T
resSource = self.ensemble.evaluateEnsembleKLIEP(np.reshape(newSrcDataArr, (1, -1)))
if isinstance(resSource[0], float) and abs(resSource[0]-self.source.dataLabels[sDataIndex])<0.0001:
trueSourceNum += 1
elif resSource[0] == self.source.dataLabels[sDataIndex]:
trueSourceNum += 1
sacc = float(trueSourceNum)/(sDataIndex+1)
self.SDataBufferArr = self.SDataBufferArr[:, 1:]
self.SDataLabels = self.SDataLabels[1:]
kernelMatSrcData = kernelMatSrcData[1:, :]
#add new instance to the buffers
self.SDataBufferArr = np.append(self.SDataBufferArr, newSrcDataArr, axis=1)
self.SDataLabels.append(self.source.dataLabels[sDataIndex])
#update kernelMatSrcData
dist_tmp = np.power(np.tile(newSrcDataArr, (1, gmUpdated.refPoints.shape[1])) - gmUpdated.refPoints, 2)
dist_2 = np.sum(dist_tmp, axis=0, dtype='float64')
kernelSDataNewFromRefs = np.exp(-dist_2 / (2 * math.pow(self.kliep.kliepDefSigma, 2)), dtype='float64')
kernelMatSrcData = np.append(kernelMatSrcData, kernelSDataNewFromRefs[np.newaxis], axis=0)
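#the appended row holds the Gaussian kernel values k(x_new, c_l) = exp(-||x_new - c_l||^2 / (2*sigma^2))
#between the new source instance and every KLIEP reference point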
#print("Satisfying the constrains.")
gmUpdated.alphah, kernelMatSrcData = self.kliep.satConstraints(self.SDataBufferArr, self.TDataBufferArr, gmUpdated.refPoints, gmUpdated.alphah, kernelMatSrcData)
sDataIndex += 1
else:
# Target Stream
print('T', end="")
newTargetDataArr = self.target.data[:, tDataIndex][np.newaxis].T
# get Target Accuracy on the new instance
resTarget = self.ensemble.evaluateEnsembleKLIEP(np.reshape(newTargetDataArr, (1,-1)))
if isinstance(resTarget[0], float) and abs(resTarget[0]-self.target.dataLabels[tDataIndex])<0.0001:
trueTargetNum += 1
elif resTarget[0] == self.target.dataLabels[tDataIndex]:
trueTargetNum += 1
acc = float(trueTargetNum)/(tDataIndex+1)
if (tDataIndex%100)==0:
Properties.logger.info('\nTotal test instance: '+ str(tDataIndex+1) + ', correct: ' + str(trueTargetNum) + ', accuracy: ' + str(acc))
fAcc.write(str(acc)+ "\n")
conf = resTarget[1] # confidence
# save confidence
targetConfSum += conf
fConf.write(str(float(targetConfSum)/(tDataIndex+1))+ "\n")
#update alpha, and satisfy constraints
#print("Update alpha and satisfy constrains")
gmUpdated.alphah, kernelMatSrcData = self.kliep.updateAlpha(self.SDataBufferArr, self.TDataBufferArr, newTargetDataArr, gmUpdated.refPoints, gmUpdated.alphah, kernelMatSrcData)
#print("\nlen(self.TDataBufferList) = ", len(self.TDataBufferList), ": target window slides")
#remove the first instance from buffers
self.TDataBufferArr = self.TDataBufferArr[:, 1:]
#update ref points
gmUpdated.refPoints = gmUpdated.refPoints[:, 1:]
# update kernelMatSrcData, as ref points has been updated
kernelMatSrcData = kernelMatSrcData[:, 1:]
# update kernelMatTrgData, as ref points has been updated
kernelMatTrgData = kernelMatTrgData[1:, 1:]
#update ref points
gmUpdated.refPoints = np.append(gmUpdated.refPoints, newTargetDataArr, axis=1)
#add to kernelMatSrcData for the last ref point
dist_tmp = np.power(
np.tile(newTargetDataArr,
(1, self.SDataBufferArr.shape[1])) - self.SDataBufferArr, 2)
dist_2 = np.sum(dist_tmp, axis=0, dtype='float64')
kernel_dist_2 = np.exp(-dist_2 / (2 * math.pow(self.kliep.kliepDefSigma, 2)), dtype='float64')
kernelMatSrcData = np.append(kernelMatSrcData, kernel_dist_2[np.newaxis].T, axis=1)
#now update kernelMatTrgData, as ref points has been updated
#first add distance from the new ref points to all the target points
dist_tmp = np.power(
np.tile(newTargetDataArr,
(1, self.TDataBufferArr.shape[1])) - self.TDataBufferArr, 2)
dist_2 = np.sum(dist_tmp, axis=0, dtype='float64')
kernel_dist_2 = np.exp(-dist_2 / (2 * math.pow(self.kliep.kliepDefSigma, 2)), dtype='float64')
kernelMatTrgData = np.append(kernelMatTrgData, kernel_dist_2[np.newaxis].T, axis=1)
#now add distances for the newly added instance to all the ref points
#add the new instance to the buffers
self.TDataBufferArr = np.append(self.TDataBufferArr, newTargetDataArr, axis=1)
dist_tmp = np.power(np.tile(newTargetDataArr, (1, gmUpdated.refPoints.shape[1])) - gmUpdated.refPoints, 2)
dist_2 = np.sum(dist_tmp, axis=0, dtype='float64')
kernelTDataNewFromRefs = np.exp(-dist_2 / (2 * math.pow(self.kliep.kliepDefSigma, 2)), dtype='float64')
kernelMatTrgData = np.append(kernelMatTrgData, kernelTDataNewFromRefs[np.newaxis], axis=0)
tDataIndex += 1
#print "sDataIndex: ", str(sDataIndex), ", tDataIndex: ", str(tDataIndex)
changeDetected = False
changeScore = 0
enoughInstToUpdate = self.SDataBufferArr.shape[1]>=Properties.kliepParB and self.TDataBufferArr.shape[1]>=Properties.kliepParB
if enoughInstToUpdate:
#print("Enough points in source and target sliding windows. Attempting to detect any change of distribution.")
changeDetected, changeScore, kernelMatTrgData = self.kliep.changeDetection(self.TDataBufferArr, gmOld.refPoints, gmOld.alphah, gmUpdated.refPoints, gmUpdated.alphah, kernelMatTrgData)
#print("Change Score: ", changeScore)
#instances from more than one class are needed for svm training
if len(set(self.SDataLabels))>1 and (changeDetected or (self.enableForceUpdate and (tDataIndex + sDataIndex - idxLastUpdate)>self.forceUpdatePeriod)): #or (tDataIndex>0 and (targetConfSum/tDataIndex)<0.1):
fConf.write(str(7777777.0) + "\n")
Properties.logger.info(
'\n-------------------------- Change of Distribution ------------------------------------')
Properties.logger.info('Change of distribution found')
Properties.logger.info(
'sDataIndex=' + str(sDataIndex) + '\ttDataIndex=' + str(tDataIndex))
Properties.logger.info('Change Detection Score: ' + str(changeScore) + ', Threshold: ' + str(self.kliep.kliepParThreshold))
#Build a new model
#First calculate the weights for each source instances
gmOld.alphah, kernelMatSrcData, kernelMatTrgData, gmOld.refPoints = self.kliep.KLIEP(self.SDataBufferArr,
self.TDataBufferArr)
#update the updated gaussian model as well
gmUpdated.setAlpha(gmOld.alphah)
gmUpdated.setRefPoints(gmOld.refPoints)
weightSrcData = self.kliep.calcInstanceWeights(kernelMatSrcData, gmUpdated.alphah)
#Build a new model
Properties.logger.info('Training a model due to change detection')
SDataBufferArrTransposed = self.SDataBufferArr.T
TDataBufferArrTransposed = self.TDataBufferArr.T
if self.useSvmCVParams==1:
params = {'gamma': [2 ** 2, 2 ** -16], 'C': [2 ** -6, 2 ** 15]}
svr = svm.SVC()
opt = grid_search.GridSearchCV(svr, params)
opt.fit(SDataBufferArrTransposed.tolist(), self.SDataLabels)
optParams = opt.best_params_
self.ensemble.generateNewModelKLIEP(SDataBufferArrTransposed.tolist(), self.SDataLabels,
TDataBufferArrTransposed.tolist(), weightSrcData[0].tolist(),
optParams['C'], optParams['gamma'])
else:
self.ensemble.generateNewModelKLIEP(SDataBufferArrTransposed.tolist(), self.SDataLabels,
TDataBufferArrTransposed.tolist(), weightSrcData[0].tolist(),
Properties.svmDefC, Properties.svmDefGamma,
Properties.svmKernel)
Properties.logger.info(self.ensemble.getEnsembleSummary())
#update the idx
idxLastUpdate = tDataIndex + sDataIndex
changeDetected = False
#keep the latest 1/4th of data and update the arrays and lists
#Properties.logger.info('Updating source and target sliding windows')
"""
In the target window, we want to keep (3x/4) instances, where x is the number of Gaussian kernel centers,
so that change-point detection is attempted again after another (x/4) instances arrive. Since the arrival
rates of the source and target streams may differ, the number of points to retain in the source window
is calculated with that in mind.
"""
#numberOfPointsInTargetToRetain = Properties.kliepParB - int(((1-probFromSource)*3*Properties.kliepParB)/4)
#numberOfPointsInSourceToRetain = Properties.kliepParB - int((probFromSource*3*Properties.kliepParB)/4)
#save the timestamp
fConf.close()
fAcc.close()
globalEndTime = time.time()
Properties.logger.info(
'\nGlobal Start Time: ' + datetime.datetime.fromtimestamp(globalEndTime).strftime('%Y-%m-%d %H:%M:%S'))
Properties.logger.info('Total Time Spent: ' + str(globalEndTime-globalStartTime) + ' seconds')
Properties.logger.info('Done !!')
return sacc, acc, globalEndTime-globalStartTime

BIN
FUSION/manager.pyc Normal file

Binary file not shown.

69
FUSION/model.py Normal file
View File

@ -0,0 +1,69 @@
import math, sklearn.metrics.pairwise as sk
from sklearn import svm
import numpy as np
import random, sys
class Model(object):
def __init__(self):
self.model = None
self.weight = 0.0
"""
Train a new SVM model on the given data with the supplied C, gamma, and kernel parameters.
Sets the trained model for this object.
"""
def trainUsingKLIEPWeights(self, traindata, trainLabels, weightSrcData, maxvar, svmC, svmGamma, svmKernel):
self.model = svm.SVC(decision_function_shape='ovr', probability=True, C=svmC, gamma=svmGamma, kernel=svmKernel)
self.model.fit(traindata, trainLabels)
"""
Test the weighted SVM to predict labels of a given test data.
Returns the result of prediction, and confidence behind the prediction
"""
def test(self, testdata):
#predict and gather results
#predictedClass = ["" for x in range(len(testdata))]
#confidences = np.zeros(len(testdata))
confidences = []
#reshapedData = np.reshape(testdata, (1,-1))
if len(testdata)==1:
testdata = np.reshape(testdata, (1,-1))
predictions = self.model.predict(testdata)
probs = self.model.predict_proba(testdata)
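#the confidence reported for each instance is the predicted class's probability from predict_proba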
for i in range(0, len(testdata)):
#curData = np.reshape(testdata[i], (1,-1))
#predictedClass[i] = self.model.predict(curData)[0]
for j in range(len(self.model.classes_)):
if self.model.classes_[j] == predictions[i]:
#confidences[i] = prob[j]
confidences.append(probs[i][j])
break
"""
scores = self.model.decision_function(curData)
if len(self.model.classes_)<=2:
confidences[i] = min(1.0, math.fabs(scores[0]))
else:
# we calculate the confidence by taking normalized score
totScore = 0.0
for x, y in zip(self.model.classes_, scores[0]):
totScore += math.fabs(y)
if predictedClass[i] == x:
confidences[i] = math.fabs(y)
confidences[i] /= totScore
"""
return predictions, confidences
"""
Set model weights using test prediction.
For source weight, use error rate with known source data labels.
For target weight, use confidence (or probability) measure on target data.
"""
def computeModelWeightKLIEP(self, data, maxvar):
totConf = 0.0
predictedClass, confidences = self.test(data)
for i in range(0, len(confidences)):
totConf += confidences[i]
return totConf/len(data)

BIN
FUSION/model.pyc Normal file

Binary file not shown.

29
FUSION/multistream.py Normal file
View File

@ -0,0 +1,29 @@
import sys
from manager import Manager
from properties import Properties
import time
"""
Parameters
datasetName: main part of dataset file name, e.g., powersupply for powersupply_source_stream.csv, powersupply_target_stream.csv
opt: fraction of incoming instances routed to the source stream (passed to startFusion as probFromSource)
(legacy baseline codes from an earlier version, not used by this entry point: 1=startMscKLIEP, 2=start, 3=start2, 4=start_skmm, 5=start_mkmm, 6=start_srconly, 7=start_trgonly)
"""
def main(datasetName, opt=1.0):
props = Properties('config.properties', datasetName)
srcfile = Properties.BASEDIR + datasetName + Properties.SRCAPPEND
trgfile = Properties.BASEDIR + datasetName + Properties.TRGAPPEND
mgr = Manager(srcfile, trgfile)
Properties.logger.info(props.summary())
Properties.logger.info('Start Stream Simulation')
source_cr, target_cr, training_time = mgr.startFusion(datasetName, opt)
print ""
print {'SourceCR': source_cr, 'TargetCR': target_cr, 'TrainingTime': training_time}
return {'SourceCR': source_cr, 'TargetCR': target_cr, 'TrainingTime': training_time}
"""
if __name__ == '__main__':
main()
"""

BIN
FUSION/multistream.pyc Normal file

Binary file not shown.

126
FUSION/properties.py Normal file
View File

@ -0,0 +1,126 @@
import logging, subprocess
import math
import threading, random
class Properties(object):
useKliepCVSigma = 0
kliepDefSigma = 0.00
kliepParEta = 0.0
kliepParLambda = 0.00
kliepParB = 0
kliepParThreshold = 0.0
useSvmCVParams = 1
svmDefGamma = 0.0001
svmDefC = 131072
ENSEMBLE_SIZE = 0
CONFTHRESHOLD = 0.0
CONFCUTOFF = 0.0
IDENTIFIER = ''
OUTFILENAME = ''
TEMPDIR = ''
LOGFILE = ''
MAXVAR = 0
BASEDIR = ''
SRCAPPEND = ''
TRGAPPEND = ''
logger = None
GAMMA = 0.0
CUSHION = 0
SENSITIVITY = 0.0
MAX_WINDOW_SIZE = 0
INITIAL_DATA_SIZE = 0
enableForceUpdate = 0
forceUpdatePeriod = 0
def __init__(self, propfilename, datasetName):
dict = {}
with open(propfilename) as f:
for line in f:
(key,val) = line.split('=')
dict[key.strip()] = val.strip()
self.__class__.useKliepCVSigma=int(dict['useKliepCVSigma'])
self.__class__.kliepDefSigma = float(dict['kliepDefSigma'])
self.__class__.kliepParEta = float(dict['kliepParEta'])
self.__class__.kliepParLambda = float(dict['kliepParLambda'])
self.__class__.kliepParB = int(dict['kliepParB'])
self.__class__.kliepParThreshold = -math.log(float(dict['sensitivity']))
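#threshold = -ln(sensitivity); the KLIEP change-detection score appears to be compared against this
#value (see the threshold logged in Manager.startFusion when a distribution change is reported)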
self.__class__.useSvmCVParams=int(dict['useSvmCVParams'])
self.__class__.svmDefGamma=float(dict['svmDefGamma'])
self.__class__.svmDefC=int(dict['svmDefC'])
self.__class__.svmKernel=str(dict['kernel'])
self.__class__.ENSEMBLE_SIZE = int(dict['ensemble_size'])
self.__class__.CONFTHRESHOLD = float(dict['confthreshold'])
self.__class__.CONFCUTOFF = float(dict['confcutoff'])
self.__class__.MAXVAR = 0
self.__class__.BASEDIR = dict['baseDir']
self.__class__.SRCAPPEND = dict['srcfileAppend']
self.__class__.TRGAPPEND = dict['trgfileAppend']
self.__class__.GAMMA = float(dict['gamma'])
self.__class__.CUSHION = int(dict['cushion'])
self.__class__.SENSITIVITY = float(dict['sensitivity'])
self.__class__.MAX_WINDOW_SIZE = int(dict['maxWindowSize'])
self.__class__.INITIAL_DATA_SIZE = int(dict['initialDataSize'])
self.__class__.enableForceUpdate = int(dict['enableForceUpdate'])
self.__class__.forceUpdatePeriod = int(dict['forceUpdatePeriod'])
self.__class__.IDENTIFIER = datasetName + '_' + str(self.__class__.INITIAL_DATA_SIZE) \
+ '_' + str(self.__class__.MAX_WINDOW_SIZE)
self.__class__.OUTFILENAME = self.__class__.IDENTIFIER + '_' + dict['output_file_name']
self.__class__.TEMPDIR = dict['tempDir']
self.__class__.LOGFILE = self.__class__.IDENTIFIER + '_' + dict['logfile']
if self.__class__.logger: self.__class__.logger = None
self.__class__.logger = self.__setupLogger()
#self.__class__.PY4JPORT = random.randint(25333, 30000)
#t = threading.Thread(target=self.__startCPDJava)
#t.daemon = True
#t.start()
def __startCPDJava(self):
subprocess.call(['java', '-jar', 'change_point.jar', str(self.__class__.GAMMA), str(self.__class__.SENSITIVITY), str(self.__class__.MAX_WINDOW_SIZE), str(self.__class__.CUSHION), str(self.__class__.CONFCUTOFF), str(self.__class__.PY4JPORT)])
def __setupLogger(self):
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
logger.addHandler(sh)
handler = logging.FileHandler(self.__class__.LOGFILE)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
def summary(self):
line = 'Parameter values are as follows:'
line += '\nuseKliepCVSigma = ' + str(self.useKliepCVSigma)
line += '\nkliepDefSigma = ' + str(self.kliepDefSigma)
line += '\nkliepParEta = ' + str(self.kliepParEta)
line += '\nkliepParLambda = ' + str(self.kliepParLambda)
line += '\nkliepParB = ' + str(self.kliepParB)
line += '\nkliepParThreshold = ' + str(self.kliepParThreshold)
line += '\nuseSvmCVParams = ' + str(self.useSvmCVParams)
line += '\nsvmDefGamma = ' + str(self.svmDefGamma)
line += '\nsvmDefC = ' + str(self.svmDefC)
line += '\nsvmKernel = ' + str(self.svmKernel)
line += '\ninitialWindowSize = ' + str(self.INITIAL_DATA_SIZE)
line += '\nmaxWindowSize = ' + str(self.MAX_WINDOW_SIZE)
line += '\nenableForceUpdate = ' + str(self.enableForceUpdate)
line += '\nforceUpdatePeriod = ' + str(self.forceUpdatePeriod)
line += '\nensemble_size = ' + str(self.ENSEMBLE_SIZE)
line += '\nMaximum Num Variables = ' + str(self.MAXVAR)
line += '\nOutput File = ' + str(self.OUTFILENAME)
return line

BIN
FUSION/properties.pyc Normal file

Binary file not shown.

12
FUSION/runScript.py Normal file
View File

@ -0,0 +1,12 @@
from multistream import main
"""
This script calls the main function from multistream.py. It takes two parameters:
- The first parameter is the dataset file location (without the extension; the extension can be specified in config.properties).
- The second parameter is the fraction of data the source stream receives. For example, if this parameter is 0.1,
roughly 10% of incoming instances go to the source stream and the remaining 90% go to the target stream.
"""
print "Running FC"
main('usps_mnist', 0.1)
print "Done"

View File

@ -0,0 +1,29 @@
import random
import sys
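#This helper interleaves lines from a source file (argv[1]) and a target file (argv[2]) into an
#output file (argv[3]), picking from each with roughly equal probability until one is exhausted,
#then appending whatever remains from both.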
sf = open(sys.argv[1])
tf = open(sys.argv[2])
of = open(sys.argv[3], mode='w')
numLines = 0
while True:
randomNum = random.uniform(0,1)
if randomNum < 0.5:
sl = sf.readline()
if not sl:
break
of.write(sl)
else:
tl = tf.readline()
if not tl:
break
of.write(tl)
numLines += 1
if numLines % 1000 == 0:
print("Process Lines: ", numLines)
for line in sf:
of.write(line)
for line in tf:
of.write(line)
sf.close()
tf.close()
of.close()