# Machine-learning protocol
# version information: v0.3
# date of update: 04/Apr/2018

library(RDRToolbox)    # Isomap nonlinear dimensional reduction
library(SNFtool)       # spectral clustering
library(caret)         # Random Forest tuning
library(randomForest)  # Random Forest

# "filename.csv" is the name of the csv file holding the auto-scaled data
data_pre <- read.csv("filename.csv")
var_num <- ncol(data_pre)  # number of variables = original dimension of the data

# An example of auto-scaled data prepared for the machine-learning approach:
#   Sample No. | Variable 1 | Variable 2 | Variable 3 | ...
#   1          | -1.0       | 1.2        | -0.9       | ...
#   2          | -1.1       | 0.5        | -1.1       | ...
#   3          | -1.1       | 0.5        | -1.1       | ...
#   ...        | ...        | ...        | ...        | ...
# In this case, you need to delete the first column (Sample No.) using:
# data_pre <- data_pre[, -1]

## Isomap nonlinear dimensional reduction
data_mat <- as.matrix(data_pre)
# FIX: the original called Isomap(data = dat_mat, ...) but "dat_mat" is never
# defined; the matrix created above is "data_mat".
data_exam <- Isomap(data = data_mat, dims = 1:var_num, k = 5,
                    plotResiduals = TRUE, verbose = TRUE)
# See the plot of residual variances for each reduced dimension and decide
# the number of dimensions to keep.
data_Iso <- Isomap(data = data_mat, dims = 3, k = 5)
# In this case, the data dimensions are reduced to three.

## Spectral clustering
data_Iso_Db <- data_Iso$dim3

# Set all the parameters:
K <- 20       # number of neighbors, usually 10-30
alpha <- 0.5  # hyperparameter, usually 0.3-0.8

# Calculate the pair-wise distances between all samples
dist_Iso <- dist2(as.matrix(data_Iso_Db), as.matrix(data_Iso_Db))
# Construct the similarity graph
W <- affinityMatrix(dist_Iso, K, alpha)
# You can also fuse multiple graphs M1, M2, ...
# by setting parameter T = 20 (number of iterations, usually 10-20) and using
# the following command:
# W <- SNF(list(W1, W2), K, T)

# Estimate the number of clusters; NUMC is the candidate range of cluster
# counts. This gives you two insights about the data sets.
estimationResult <- estimateNumberOfClustersGivenGraph(W, NUMC = 2:5)

C <- 3  # number of clusters
group <- spectralClustering(W, C)  # the final subtype information
write.csv(group, "ClusteredData.csv")  # result file for spectral clustering

## Random Forest
classinfo <- group
data_cl <- cbind(classinfo, data_pre)
data_cl$classinfo <- ifelse(data_cl$classinfo == 1, "a", data_cl$classinfo)
data_cl$classinfo <- ifelse(data_cl$classinfo == 2, "b", data_cl$classinfo)
data_cl$classinfo <- ifelse(data_cl$classinfo == 3, "c", data_cl$classinfo)
# FIX: randomForest needs a factor response to perform classification; the
# ifelse() chain above leaves a character vector, which would be (mis)handled
# as regression. The factor also makes data.matrix() inside RFkCV encode the
# labels as integer codes (1/2/3) instead of NA.
data_cl$classinfo <- factor(data_cl$classinfo)

# Examine the number of variables randomly chosen at each split of the tree
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
tGrid <- expand.grid(mtry = 1:7)
model.tune <- train(classinfo ~ ., data = data_cl, method = "rf",
                    trace = FALSE, trControl = fitControl, tuneGrid = tGrid)

# Set all the parameters (inspect model.tune to choose mtry):
my.mtry <- 1
default.ntree <- 500  # FIX: renamed from the "default.ntee" typo

# Random Forest with k-fold cross validation.
#   data_cl : data frame whose first column ("classinfo") is the factor label
#   k       : number of cross-validation folds
# Side effects: writes AccRaw.csv, AccSum.csv, Acc.csv, ImpMDA.csv and
# ImpGini.csv to the working directory. Uses the globals my.mtry and
# default.ntree set above.
RFkCV <- function(data_cl, k) {
  # Pad with all-NA rows so the rows split evenly into k folds; na.omit below
  # removes the padding again. (If nrow %% k == 0, k dummy rows are added, one
  # per fold — harmless. NOTE(review): na.omit would also drop genuine rows
  # containing NA; the input is assumed complete.)
  residu <- k - nrow(data_cl) %% k
  dumdat <- as.data.frame(matrix(NA, nrow = residu, ncol = ncol(data_cl)))
  names(dumdat) <- names(data_cl)
  alldum <- rbind(data_cl, dumdat)
  alldum.split <- split(alldum, 1:k)       # round-robin assignment to folds
  all.split <- lapply(alldum.split, na.omit)

  # k-fold cross validation:
  pv.eval.raw <- as.numeric(NULL)
  mean.dec.acc <- as.numeric(NULL)
  Gini.imp <- as.numeric(NULL)
  for (h in 1:k) {
    evaldata <- data.matrix(all.split[[h]])  # held-out fold; factor -> codes
    allspr <- all.split[-h]
    object <- data.frame(do.call("rbind", allspr))  # training folds
    # Random Forest with the training data
    model <- randomForest(classinfo ~ ., data = object, importance = TRUE,
                          ntree = default.ntree, mtry = my.mtry)
    # Cross validation with the evaluating data
    preds <- predict(model, newdata = evaldata)
    pred <- t(preds)
    correct.label <- evaldata[, 1]          # integer codes of the true class
    pv.eval <- rbind(correct.label, pred)
    pv.eval.raw <- cbind(pv.eval.raw, pv.eval)
    # Calculating importance of variables
    imp <- importance(model)
    incM <- imp[, 1]                        # mean decrease in accuracy
    incN <- imp[, 2]                        # mean decrease in Gini
    mean.dec.acc <- rbind(mean.dec.acc, incM)
    Gini.imp <- rbind(Gini.imp, incN)
  }
  # Confusion matrix of true labels vs predictions, and overall accuracy
  # (relies on class code i corresponding to predicted level i on the diagonal)
  sum.pv <- table(pv.eval.raw[1, ], pv.eval.raw[2, ])
  mod.acc <- as.numeric(sum(diag(sum.pv)) / sum(sum.pv))

  # Write the results of Random Forest
  write.table(pv.eval.raw, "AccRaw.csv", sep = ",")
  write.table(sum.pv, "AccSum.csv", sep = ",")
  write.table(mod.acc, "Acc.csv", sep = ",")
  write.table(mean.dec.acc, "ImpMDA.csv", sep = ",")
  write.table(Gini.imp, "ImpGini.csv", sep = ",")
}

# Get the results of Random Forest (usage)
RFkCV(data_cl, k = 5)