Cross-validation on a Weka decision tree (C4.5 / J48)


% Pree Thiengburanathum
% File Name: DecisionTreeC4.5.m
% Last updated 5 December 2014
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Description:
% This function is responsible for constructing a C4.5 decision tree model
% using the Weka library, and for evaluating the performance of feature
% selection algorithms using balanced k-fold cross-validation.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Input:
% inputs - pre-processed vectors of independent variables
% algoName - the name of validate feature selection algorithm
%
% Output:
% bestAccuRate - the best accuracy
% bestModel - the best model
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
function [bestAccuRate, bestModel] = DecisionTreeC4p5(inputs, algoName)
% DecisionTreeC4p5 trains Weka J48 (C4.5) decision trees over growing
% feature subsets and scores each with balanced k-fold cross-validation.
%
% Input:
%   inputs   - table of pre-processed predictors; last column is the class
%   algoName - name of the feature-selection algorithm being validated
%              (used to label console and file output)
% Output:
%   bestAccuRate - highest mean CV accuracy found (percent)
%   bestModel    - Weka classifier from the configuration that achieved it
%                  (NaN if no configuration beat 0%)
disp('Running Decision Tree C4.5...');
outFileName = strcat(algoName, '_DT_C4.5.txt');
% BUG FIX: results were written to a file named just 'algoName';
% open the intended output file instead, and fail loudly if it cannot open.
fileID = fopen(outFileName, 'wt');
if fileID == -1
    error('DecisionTreeC4p5:fileOpen', 'Cannot open output file: %s', outFileName);
end
relationName = 'tpd';
bestAccuRate = 0;
bestModel = NaN;

nRun  = 5;                 % repeated CV experiments
nFea  = size(inputs, 2);
nFold = 10;                % folds per cross-validation

for k = 1:nRun
    % Feature subsets grow in steps of 3 (columns assumed ranked by the
    % feature-selection algorithm, most important first).
    for i = 1:3:nFea
        X = inputs(:, 1:i);
        % Randomly generated indices for a balanced k-fold cross-validation.
        [trainIdx, testIdx] = sampling(table2array(X)', table2array(inputs(:, end))');
        totalAccuRate = zeros(1, nFold);   % preallocate: fold count is known
        for j = 1:nFold
            % BUG FIX: format strings read '%sn' (missing newline escape),
            % producing a stray 'n' and no line break.
            fprintf('Run# %s\n', int2str(k));
            fprintf('Important feature = %s\n', int2str(i));
            fprintf('Balanced K-folds Cross validation fold: %s\n', int2str(j));

            % Append the class column so Weka sees predictors + class.
            trainData = [X(trainIdx(j, :), :) inputs(trainIdx(j, :), end)];
            testData  = [X(testIdx(j, :), :)  inputs(testIdx(j, :), end)];

            disp('Converting to weka Java object...');
            wekaTrainObj = Matlab2weka(relationName, trainData.Properties.VariableNames, table2array(trainData));
            wekaTestObj  = Matlab2weka(relationName, testData.Properties.VariableNames, table2array(testData));

            %{
            SaveARFF(strcat(algoName, '_test_', int2str(i), '.arff'), wekaTrainObj);
            SaveARFF(strcat(algoName, '_train_', int2str(i), '.arff'), wekaTestObj);
            %}

            % Train the C4.5 (J48) classifier on this fold's training split.
            model = trainWekaClassifier(wekaTrainObj, 'trees.J48');

            % Test the classifier model.
            predicted = wekaClassify(wekaTestObj, model);

            % Actual class index values from the test dataset (last attribute).
            actual = wekaTestObj.attributeToDoubleArray(wekaTestObj.numAttributes - 1);

            accuRate = sum(actual == predicted) * (100 / numel(predicted));

            nCorrect   = numel(find(actual == predicted));
            nIncorrect = numel(find(actual ~= predicted));
            disp(['number of correctly classified instances: ', int2str(nCorrect)]);
            disp(['number of incorrectly classified instances: ', int2str(nIncorrect)]);
            disp(['accuracy rate= ', num2str(accuRate)]);
            disp(' ');
            totalAccuRate(j) = accuRate;
        end
        avgAccuRate = mean(totalAccuRate);
        disp('********************************************************');
        disp([algoName, ' with ', int2str(i), ' selected features']);
        fprintf(fileID, '%s with %s selected features\n', algoName, int2str(i));
        % Reuse avgAccuRate instead of recomputing mean(totalAccuRate).
        disp([int2str(nFold), ' folds CV accuracy rate = ', num2str(avgAccuRate), '%']);
        fprintf(fileID, '%s folds CV accuracy rate = %s%%\n', int2str(nFold), num2str(avgAccuRate));
        disp('********************************************************');

        if avgAccuRate > bestAccuRate
            disp('****found best model****');
            bestAccuRate = avgAccuRate;
            % NOTE(review): 'model' is the classifier trained on the LAST
            % fold of the winning configuration, not a retrain on all data
            % nor the best single fold — confirm this is intended.
            bestModel = model;
        end
    end
end
disp('Finished Decision Tree C4.5!');
fclose(fileID);
end % end function DecisionTreeC4p5

Leave a Reply

Your email address will not be published. Required fields are marked *