# Runs the SOTA (Self-Organizing Tree Algorithm) clustering from the clValid package, an algorithm designed for bioinformatics data.
# Command-line arguments:
# args[1] = the path of the current experiment
# args[2] = the name of the input file
# args[3] = the name of the output file
# args[4] = success message
# args[5] = what experiment times to use (TR, RA, TR and RA)
# args[6] = separator of data in the source file
# args[7] = type of output (cluster print or heatmap)
# args[8] = output format (SVG, JPG, PNG)
# args[9] = normalization Method (None, Division, Log)
# args[10] = normalization place (After reading data - before clustering, After clustering - before displaying)
# args[11] = color for heatmap (green to red, yellow to blue)

# args[12] = maxCycles: integer value representing the maximum number of iterations
#          allowed. The resulting number of clusters returned by ‘sota’
#          is maxCycles+1 unless ‘unrest.growth’ is set to FALSE and the
#          ‘maxDiversity’ criteria is satisfied prior to reaching the
#          maximum number of iterations.
# args[13] = maxEpochs: integer value indicating the maximum number of training
#          epochs allowed per cycle. By default, ‘maxEpochs’ is set to
#          1000.
# args[14] = distance: character string used to represent the metric to be used for
#          calculating dissimilarities between profiles. 'euclidean' is
#          the default, with 'correlation' being another option.
# args[15] = wcell: value specifying the winning cell migration weight. The
#          default is 0.01.
# args[16] = pcell: value specifying the parent cell migration weight. The
#          default is 0.005.
# args[17] = scell: value specifying the sister cell migration weight. The
#        default is 0.001.
# args[18] = delta: value specifying the minimum epoch error improvement. This
#          value is used as a threshold for signaling the start of a new
#          cycle. It is set to 1e-04 by default.
# args[19] = neighb.level: integer value used to indicate which cells are candidates
#          to accept new profiles. This number specifies the number of
#          levels up the tree the algorithm moves in the search of
#          candidate cells for the redistribution of profiles. The
#          default is 0.
# args[20] = maxDiversity: value representing a maximum variability allowed within a
#          cluster. 0.9 is the default value
# args[21] = unrest.growth: logical flag: if TRUE then the algorithm will run
#          ‘maxCycles’ iterations regardless of whether the
#          ‘maxDiversity’ criteria is satisfied or not and ‘maxCycles’+1
#          clusters will be produced; if FALSE then the algorithm can
#          potentially stop before reaching the ‘maxCycles’ based on the
#          current state of cluster diversities. A smaller than usual
#          number of clusters will be obtained. The default value is
#          TRUE.


# point to the location of the libs
.libPaths('lib/linux/')
library("clValid")
library("gplots")
#require("Heatplus")
library("stats")
library("RSvgDevice")
library("amap")
library("ape")
require(graphics); require(grDevices)
library("ggplot2")

source("/home/andrei/univ/master/eclipse_workspace/PFC/src/pfc/protected/commands/shell/r/common.R")
#source("/home/andrei/sitio/PFC/src/pfc/protected/commands/shell/r/common.R")

## Column positions inside the node matrix handled by the Newick helpers below
## (the script passes sotaCl$c.tree as that matrix).
## Column with the id of a node's parent: sotaToNewick looks for the value 0 to find the root,
## childsNode compares it with a node id to find that node's children.
.ANCESTOR_COL_IND <- 2
## Column with the node's own id; also used to index the node distance matrix.
.NODE_ID_IND <- 1
## Column with the node type; the helpers treat the value 0 as "not a leaf".
.NODE_TYPE_IND <- 3

## Arguments given on the command line after the script name (see the file header for their
## meaning). They are always character strings.
args <- commandArgs(TRUE)
# args[1] <- "/home/andrei/univ/master/eclipse_workspace/PFC/docs/data_of_pfc/"
# args[2] <- "Heat_stress_Table_S1_500g.csv"#1341673642Stabilogen.csv"#"Datos_40_good.csv"
# args[3] <- "result-sota"
# args[4] <- "succ"
# args[5] <- .EXP_TIMES_TR_RA
# args[6] <- ";"
# args[7] <- .VOUT_TYPE_DENDROGRAM
# args[8] <- .VOUT_FORMAT_SVG
# args[9] <- .NORM_METHOD_DIV
# args[10] <- .NORM_METHOD_WHEN_AFT_CL
# args[11] <- .HEATMAP_COL_GREEN_RED
# 
# args[12] <- 4
# args[13] <- 1000
# args[14] <- "euclidean"
# args[15] <- 0.01
# args[16] <- 0.005
# args[17] <- 0.001
# args[18] <- 1e-04
# args[19] <- 0
# args[20] <- 0.9
# args[21] <- TRUE
  
## Transforms the tree object (full tree with all nodes, including non-leafs) into a string containing
## a Newick representation.
##
## fullTreeMat: node matrix, one row per node. Column .ANCESTOR_COL_IND holds the id of the parent
##              (0 for the root); columns 4 .. (ncol - 1) hold the node profile used to compute the
##              distances between nodes.
## distMethod:  name of the dissimilarity. Either a method accepted by stats::dist, or 'correlation',
##              which is computed here as 1 - Pearson correlation between the node profiles.
##              NOTE(review): confirm that this matches the correlation distance used by clValid::sota.
## Returns the Newick string terminated by ';'.
## Stops with an error when the matrix contains no root node.
sotaToNewick <- function(fullTreeMat, distMethod) 
{
  ## the root is the first node whose ancestor id is 0
  rootRows <- which(fullTreeMat[, .ANCESTOR_COL_IND] == 0)
  if (length(rootRows) == 0) {
    stop("sotaToNewick: no root node (ancestor id 0) found in the tree matrix", call. = FALSE)
  }
  rootNode <- fullTreeMat[rootRows[1], ]

  ## pairwise distances between all node profiles, computed once and shared by both tree walks
  profiles <- fullTreeMat[, 4:(ncol(fullTreeMat) - 1), drop = FALSE]
  if (identical(distMethod, "correlation")) {
    ## stats::dist has no 'correlation' method, so build the dissimilarity from the correlations
    distMat <- 1 - stats::cor(t(profiles))
  } else {
    distMat <- as.matrix(stats::dist(profiles, method = distMethod))
  }

  ## the longest root-to-leaf path is needed to pad every leaf branch to the same total depth
  longestPath <- getLongestPath(fullTreeMat, 0, rootNode, distMat)
  paste(sep = '', recursiveNewick(fullTreeMat, 0, rootNode, distMat, longestPath), ';')
}

## Recursively serialises the binary rooted tree generated by the sota algorithm into an ultrametric
## Newick fragment.
##
## fullTreeMat:  matrix with one row per tree node
## pathDistance: sum of the sibling distances accumulated between the tree root and the current node
## rootNode:     row of fullTreeMat describing the node whose subtree is serialised
## distMat:      square matrix with the distances between nodes, indexed by node id
## longestPath:  largest accumulated distance reached by any leaf (see getLongestPath)
## Returns the Newick text of the subtree, without the terminating ';'.
recursiveNewick <- function(fullTreeMat, pathDistance, rootNode, distMat, longestPath) 
{
  ## a leaf (node type different from 0) is written as its label; its branch length is whatever is
  ## still missing to reach the longest path
  if (rootNode[.NODE_TYPE_IND] != 0) {
    return(paste0('Cluster', rootNode[.NODE_ID_IND], ':', (longestPath - pathDistance)))
  }

  ## childsNode returns both child rows concatenated in a single vector: split it back in two
  nodeWidth <- ncol(fullTreeMat)
  bothChildren <- childsNode(fullTreeMat, rootNode[.NODE_ID_IND])
  leftChild <- bothChildren[seq_len(nodeWidth)]
  rightChild <- bothChildren[nodeWidth + seq_len(nodeWidth)]

  ## the distance between the two siblings is added to the path of both subtrees and is also the
  ## branch length written for the current node
  siblingGap <- distMat[rightChild[.NODE_ID_IND], leftChild[.NODE_ID_IND]]
  depthBelow <- pathDistance + siblingGap

  paste0('(',
         recursiveNewick(fullTreeMat, depthBelow, leftChild, distMat, longestPath),
         ',',
         recursiveNewick(fullTreeMat, depthBelow, rightChild, distMat, longestPath),
         '):',
         siblingGap)
}

## Computes the path cost of the leaf that lies farthest from the root node.
##
## fullTreeMat:  matrix with one row per tree node
## pathDistance: sum of the sibling distances accumulated between the tree root and the current node
## rootNode:     row of fullTreeMat describing the node whose subtree is explored
## distMat:      square matrix with the distances between nodes, indexed by node id
## Returns the largest accumulated distance found among the leaves below rootNode.
getLongestPath <- function(fullTreeMat, pathDistance, rootNode, distMat) 
{
  ## a leaf (node type different from 0) ends the path: report the cost accumulated so far
  if (rootNode[.NODE_TYPE_IND] != 0) {
    return(pathDistance)
  }

  ## childsNode returns both child rows concatenated in a single vector: split it back in two
  nodeWidth <- ncol(fullTreeMat)
  bothChildren <- childsNode(fullTreeMat, rootNode[.NODE_ID_IND])
  leftChild <- bothChildren[seq_len(nodeWidth)]
  rightChild <- bothChildren[nodeWidth + seq_len(nodeWidth)]

  ## going one level down costs the distance between the two siblings, on both sides
  siblingGap <- distMat[rightChild[.NODE_ID_IND], leftChild[.NODE_ID_IND]]
  depthBelow <- pathDistance + siblingGap

  max(getLongestPath(fullTreeMat, depthBelow, leftChild, distMat),
      getLongestPath(fullTreeMat, depthBelow, rightChild, distMat))
}

## Determines the children of a certain node using the parent id and the matrix with all the nodes.
##
## fullTreeMat: matrix with one row per tree node; column .ANCESTOR_COL_IND holds the parent id
## rootNodeId:  id of the node whose children are requested
## Returns the rows of the first two children, concatenated in one vector (first child followed by
## second child). When fewer than two children exist the vector is shorter: it holds a single row,
## or is NULL when there is no child at all.
childsNode <- function(fullTreeMat, rootNodeId) 
{
  ## one vectorised comparison instead of scanning the matrix row by row
  childRows <- which(fullTreeMat[, .ANCESTOR_COL_IND] == rootNodeId)
  ## the tree is binary: only the first two matches are of interest
  childRows <- childRows[seq_len(min(length(childRows), 2))]

  ## do.call(c, list()) yields NULL, matching the result for a node without children
  do.call(c, lapply(childRows, function(rowIndex) fullTreeMat[rowIndex, ]))
}
#print(paste(sep="/", getwd(), 'common.R'))
#source('common.R')
## Read and prepare the input data, run sota and convert the resulting tree into an hclust object.
data <- prepareInput(args[1], args[2], args[5], args[6], args[9], args[10])

## Command-line arguments are character strings. unrest.growth is documented as a logical flag:
## accept the usual spellings of TRUE/FALSE and "0" as FALSE; any other value keeps the growth
## unrestricted, which is the documented default.
unrestGrowth <- as.logical(args[21])
if (is.na(unrestGrowth)) {
  unrestGrowth <- !identical(args[21], "0")
}

## Every numeric parameter is converted explicitly. This includes delta (the minimum epoch error
## improvement): left as a character string it would be compared as text in numeric comparisons.
sotaCl <- sota(as.matrix(data),
               maxCycles = as.numeric(args[12]),
               maxEpochs = as.numeric(args[13]),
               distance = args[14],
               wcell = as.numeric(args[15]),
               pcell = as.numeric(args[16]),
               scell = as.numeric(args[17]),
               delta = as.numeric(args[18]),
               neighb.level = as.numeric(args[19]),
               maxDiversity = as.numeric(args[20]),
               unrest.growth = unrestGrowth)

## Build the Newick representation of the full tree once; it is printed for the log and parsed by ape
newick <- sotaToNewick(sotaCl$c.tree, args[14])
print(newick)
tree <- read.tree(text = newick)
sotaHcInit <- as.hclust.phylo(tree)
print(sotaHcInit$labels)

## Replace the labels with consecutive names: Cluster1, Cluster2, ...
index <- seq_along(sotaHcInit$labels)
sotaHcInit$labels <- paste0('Cluster', index)
print(sotaHcInit$labels)

# print(sotaCl$tree)
# print(sotaCl$clust)
# print(sotaCl$diversity)
## Number of rows of the tree returned by sota; passed to createMatrixPlot below
numClusters <- nrow(sotaCl$tree)

## Plot height, in inches. 5 is the default used when the output is not a dendrogram; for a
## dendrogram the height follows the number of rows of the plot layout matrix.
plotHeight <- 5
if (args[7] == .VOUT_TYPE_DENDROGRAM) {
  matrixSettingsPlot <- createMatrixPlot(args[5], numClusters)
  plotHeight <- 1.5 * (nrow(matrixSettingsPlot) + 1)
}

## The output file is <experiment path><output name><output format>
outputFile <- paste0(args[1], args[3], args[8])

## Open the graphics device matching the requested format; PNG is used for anything that is
## neither SVG nor JPG
if (args[8] == .VOUT_FORMAT_SVG) {
  devSVG(file = outputFile, width = 10, height = plotHeight)
} else if (args[8] == .VOUT_FORMAT_JPG) {
  jpeg(filename = outputFile, width = 7, height = plotHeight, units = 'in', res = 100)
} else {
  png(filename = outputFile, width = 7, height = plotHeight, units = 'in', res = 100)
}

## Get the real cluster ids of the result (the names of the `totals` table) to be used in the
## naming convention
clId <- as.numeric(attributes(sotaCl$totals)$dimnames$clust)
lenCluster <- length(sotaCl$totals)
print(sotaCl$totals)

## Save the clusters with their means in a CSV file
printMatrix <- renameHeaders(args[5], format(data.frame(sotaCl$tree[, -c(1:3)])))
rownames(printMatrix) <- sotaHcInit$labels
saveAsCsv(paste0(args[1], args[3], '.csv'), printMatrix, args[5])

## Save the cluster details in a file. Each cluster gets one line holding, separated by args[6]:
## the cluster name, the number of genes in the cluster and the names of those genes.
index <- seq_len(lenCluster)
clusterHeader <- paste0('Cluster', clId[index], args[6], sotaCl$totals[index])
print(clId)

## The gene names come from the initial data matrix; sotaCl$clust gives the cluster id of each row.
## Every line is assembled in one step instead of appending the genes one at a time.
genesArray <- rownames(data)
clusterString <- vapply(index, function(k) {
  members <- genesArray[sotaCl$clust == clId[k]]
  paste(c(clusterHeader[k], members), collapse = args[6])
}, character(1))

write(clusterString, file = paste0(args[1], args[3], .FILE_CLUSTER_DETAILS_EXT), sep = '\n',
      ncolumns = 1)

## Matrix to display: the sota tree without its first three columns, labelled with the cluster
## names (rows) and the column names of the input data (columns)
resultMatrix <- sotaCl$tree[, -(1:3)]
rownames(resultMatrix) <- sotaHcInit$labels
colnames(resultMatrix) <- colnames(data)

## Normalize the data before displaying, when the normalization was requested after clustering
if (args[10] == .NORM_METHOD_WHEN_AFT_CL) {
  if (args[5] == .EXP_TIMES_TR_RA) {
    ## Two types of columns are present: each type of experiment is normalized independently.
    ## The split point is the number of columns whose upper-cased name matches .RA_PATTERN.
    firstColOtherExp <- length(grep(.RA_PATTERN, toupper(names(data))))
    resultMatrix <- normFunction(resultMatrix, args[9], 1, firstColOtherExp)
    resultMatrix <- normFunction(resultMatrix, args[9], firstColOtherExp + 1, ncol(data))
  } else {
    ## a single type of experiment: normalize all the columns together
    resultMatrix <- normFunction(resultMatrix, args[9], 1, ncol(resultMatrix))
  }
}

## Draw the result on the open graphics device: either the dendrogram together with the profiles,
## or a heatmap of the result matrix.
if (args[7] == .VOUT_TYPE_DENDROGRAM) {
  plotDendroProfiles(data, args[5], resultMatrix, matrixSettingsPlot, as.dendrogram(sotaHcInit))
} else if (args[7] == .VOUT_TYPE_HEATMAP) {
  ## Pick the colour scale. An unknown option stops with an explicit message; previously the
  ## script failed later with "object 'colorMapS' not found".
  if (args[11] == .HEATMAP_COL_GREEN_RED) {
    colorMapS <- greenred(.HEATMAP_NUM_COLS_INTERVAL)
  } else if (args[11] == .HEATMAP_COL_YELLOW_BLUE) {
    colorMapS <- colorpanel(.HEATMAP_NUM_COLS_INTERVAL, 'yellow', 'black', 'blue')
  } else {
    stop("Unknown heatmap colour option: ", args[11], call. = FALSE)
  }

  ## The rows follow the sota dendrogram, the columns keep their order (Colv = FALSE) and no
  ## dendrogram is drawn. distfun and hclustfun are identity functions, as in the original call.
  heatmap.2(data.matrix(na.omit(resultMatrix)),
            distfun = function(x) {x},
            Colv = FALSE,
            Rowv = as.dendrogram(sotaHcInit),
            dendrogram = "none",
            hclustfun = function(x) {x},
            trace = 'none',
            na.color = 'black',
            col = colorMapS)
}

## Close the device so that the output file is written
dev.off()
## If the script reaches this point all the functions could run, thus the execution is successful
print(args[4])