# Machine-learning protocol
# version information: v0.3
# date of update: 04/Apr/2018

library(RDRToolbox)    # Isomap nonlinear dimensional reduction
library(SNFtool)       # spectral clustering
library(caret)         # Random Forest tuning
library(randomForest)  # Random Forest

# "filename.csv" is the name of the csv file holding the auto-scaled data
data_pre <- read.csv("filename.csv")
var_num <- ncol(data_pre)  # number of variables = original dimension of the data

# An example of auto-scaled data prepared for the machine-learning approach:
#   Sample No. | Variable 1 | Variable 2 | Variable 3 | ...
#   1          | -1.0       | 1.2        | -0.9       | ...
#   2          | -1.1       | 0.5        | -1.1       | ...
#   3          | -1.1       | 0.5        | -1.1       | ...
#   ...        | ...        | ...        | ...        | ...
# In this case, you need to delete the first column (Sample No.) using:
# data_pre <- data_pre[, -1]

## Isomap nonlinear dimensional reduction
data_mat <- as.matrix(data_pre)
# FIX: the original called Isomap(data = dat_mat, ...) but "dat_mat" is never
# defined; the matrix created above is "data_mat".
data_exam <- Isomap(data = data_mat, dims = 1:var_num, k = 5,
                    plotResiduals = TRUE, verbose = TRUE)
# See the plot of residual variances for each reduced dimension and decide
# the number of dimensions to keep.
data_Iso <- Isomap(data = data_mat, dims = 3, k = 5)
# In this case, the data dimensions are reduced to three.

## Spectral clustering
data_Iso_Db <- data_Iso$dim3

# Set all the parameters:
K <- 20       # number of neighbors, usually 10-30
alpha <- 0.5  # hyperparameter, usually 0.3-0.8

# Calculate the pair-wise distances between all samples
dist_Iso <- dist2(as.matrix(data_Iso_Db), as.matrix(data_Iso_Db))
# Construct the similarity graph
W <- affinityMatrix(dist_Iso, K, alpha)
# You can also fuse multiple graphs M1, M2, ...
# by setting parameter T = 20 (number of iterations, usually 10-20) and using
# the following command:
# W <- SNF(list(W1, W2), K, T)

# Estimate the number of clusters; NUMC is the candidate range of cluster
# counts. This gives you two insights about the data sets.
estimationResult <- estimateNumberOfClustersGivenGraph(W, NUMC = 2:5)

C <- 3  # number of clusters
group <- spectralClustering(W, C)  # the final subtype information
write.csv(group, "ClusteredData.csv")  # result file for spectral clustering

## Random Forest
classinfo <- group
data_cl <- cbind(classinfo, data_pre)
data_cl$classinfo <- ifelse(data_cl$classinfo == 1, "a", data_cl$classinfo)
data_cl$classinfo <- ifelse(data_cl$classinfo == 2, "b", data_cl$classinfo)
data_cl$classinfo <- ifelse(data_cl$classinfo == 3, "c", data_cl$classinfo)
# FIX: randomForest needs a factor response to perform classification; the
# ifelse() chain above leaves a character vector, which would be (mis)handled
# as regression. The factor also makes data.matrix() inside RFkCV encode the
# labels as integer codes (1/2/3) instead of NA.
data_cl$classinfo <- factor(data_cl$classinfo)

# Examine the number of variables randomly chosen at each split of the tree
fitControl <- trainControl(method = "repeatedcv", number = 5, repeats = 10)
tGrid <- expand.grid(mtry = 1:7)
model.tune <- train(classinfo ~ ., data = data_cl, method = "rf",
                    trace = FALSE, trControl = fitControl, tuneGrid = tGrid)

# Set all the parameters (inspect model.tune to choose mtry):
my.mtry <- 1
default.ntree <- 500  # FIX: renamed from the "default.ntee" typo

# Random Forest with k-fold cross validation.
#   data_cl : data frame whose first column ("classinfo") is the factor label
#   k       : number of cross-validation folds
# Side effects: writes AccRaw.csv, AccSum.csv, Acc.csv, ImpMDA.csv and
# ImpGini.csv to the working directory. Uses the globals my.mtry and
# default.ntree set above.
RFkCV <- function(data_cl, k) {
  # Pad with all-NA rows so the rows split evenly into k folds; na.omit below
  # removes the padding again. (If nrow %% k == 0, k dummy rows are added, one
  # per fold — harmless. NOTE(review): na.omit would also drop genuine rows
  # containing NA; the input is assumed complete.)
  residu <- k - nrow(data_cl) %% k
  dumdat <- as.data.frame(matrix(NA, nrow = residu, ncol = ncol(data_cl)))
  names(dumdat) <- names(data_cl)
  alldum <- rbind(data_cl, dumdat)
  alldum.split <- split(alldum, 1:k)       # round-robin assignment to folds
  all.split <- lapply(alldum.split, na.omit)

  # k-fold cross validation:
  pv.eval.raw <- as.numeric(NULL)
  mean.dec.acc <- as.numeric(NULL)
  Gini.imp <- as.numeric(NULL)
  for (h in 1:k) {
    evaldata <- data.matrix(all.split[[h]])  # held-out fold; factor -> codes
    allspr <- all.split[-h]
    object <- data.frame(do.call("rbind", allspr))  # training folds
    # Random Forest with the training data
    model <- randomForest(classinfo ~ ., data = object, importance = TRUE,
                          ntree = default.ntree, mtry = my.mtry)
    # Cross validation with the evaluating data
    preds <- predict(model, newdata = evaldata)
    pred <- t(preds)
    correct.label <- evaldata[, 1]          # integer codes of the true class
    pv.eval <- rbind(correct.label, pred)
    pv.eval.raw <- cbind(pv.eval.raw, pv.eval)
    # Calculating importance of variables
    imp <- importance(model)
    incM <- imp[, 1]                        # mean decrease in accuracy
    incN <- imp[, 2]                        # mean decrease in Gini
    mean.dec.acc <- rbind(mean.dec.acc, incM)
    Gini.imp <- rbind(Gini.imp, incN)
  }
  # Confusion matrix of true labels vs predictions, and overall accuracy
  # (relies on class code i corresponding to predicted level i on the diagonal)
  sum.pv <- table(pv.eval.raw[1, ], pv.eval.raw[2, ])
  mod.acc <- as.numeric(sum(diag(sum.pv)) / sum(sum.pv))

  # Write the results of Random Forest
  write.table(pv.eval.raw, "AccRaw.csv", sep = ",")
  write.table(sum.pv, "AccSum.csv", sep = ",")
  write.table(mod.acc, "Acc.csv", sep = ",")
  write.table(mean.dec.acc, "ImpMDA.csv", sep = ",")
  write.table(Gini.imp, "ImpGini.csv", sep = ",")
}

# Get the results of Random Forest (usage)
RFkCV(data_cl, k = 5)