# TrAnSys
# Elaborado por Itamar José G. Nunes et al.

######################################################################################
# -> Funções que carregam os datasets do GEO
#-------------------------------------------------

# Carrega os pacotes:

library("GEOquery")


GSEreadMatrices <- function(fnames)
{
  finalDf = NULL
  totalLineCount = 0
  for(file in c(fnames))
  {
    dat = readLines(file)
    startIndex = grep(pattern = "^[!]series_matrix_table_begin", x = dat)
    lastIndex = grep(pattern = "^[!]series_matrix_table_end", x = dat) - 2
    lineCount = lastIndex - startIndex
    if (!is.null(finalDf) && lineCount != totalLineCount) { next }
    datamat = read.delim(file, row.names = 1, skip = startIndex, nrows = lineCount, check.names = F)
    if (is.null(finalDf))
    {
      finalDf = datamat
      totalLineCount = lineCount
    } else {
      finalDf[,names(datamat)] = datamat
    }
  }
  eset = new("ExpressionSet", exprs = as.matrix(finalDf))
  return(eset)
}


GEOpreprocess <- function (fname, exclude = NULL, include = NULL)
{
  dat <- readLines(fname)
  nseries <- sum(grepl("^!Series_", dat))
  nsamples <- sum(grepl("^!Sample_", dat))
  con <- environment(parseGEO)$fileOpen(fname)
  header <- suppressWarnings(read.table(con, sep = "\t", header = FALSE, 
                                        nrows = nseries, allowEscapes = TRUE))
  tmpdat <- suppressWarnings(read.table(con, sep = "\t", header = FALSE, 
                                        nrows = nsamples, allowEscapes = TRUE,
                                        blank.lines.skip = TRUE, skipNul = TRUE))
  tmptmp <- t(tmpdat)
  sampledat <- rbind(data.frame(), tmptmp[-1, ])
  colnames(sampledat) <- make.unique(sub("!Sample_", "", as.character(tmpdat[, 
                                                                             1])))
  suppressWarnings(readLines(con, 1))
  datamat <- suppressWarnings(read.delim(con, sep = "\t", header = TRUE, 
                                         na.strings = c("NA", "null", "NULL", "Null"), comment.char = ""))
  close(con)
  tmprownames = datamat[, 1]
  datamat <- as.matrix(datamat[!is.na(tmprownames), -1])
  rownames(datamat) <- tmprownames[!is.na(tmprownames)]
  if (nrow(datamat) == 1) {
    datamat <- datamat[1:(nrow(datamat) - 1), ]
  } else {
    datamat <- as.matrix(datamat[1:(nrow(datamat) - 1), ])
  }
  rownames(sampledat) <- colnames(datamat)
  # Começa aqui código alterado 1
  excmode<-"None"
  incmode<-"None"
  if (is.vector(exclude, mode="numeric")&(length(exclude) != 0)) {
    exc<-(exclude * -1)
    sampledat<-sampledat[exc,,1]
    excmode<-"Numeric"
    if(is.null(include) == FALSE){warning("Both 'exclude' and 'include' inputs are not empty. Therefore, only the 'exclude' input was taken into account")}
    include<-NULL
  } else if (is.vector(exclude, mode="character")&(length(exclude) != 0)){
    exc<-as.vector(unlist(sapply(exclude, FUN=grep, x=as.vector(tmptmp[,2])[-1],ignore.case=FALSE)))    
    if (length(exc > 0)) {
      sampledat<-sampledat[(exc*-1),,1]
      excmode<-"Char"
      if(is.null(include) == FALSE){warning("Both 'exclude' and 'include' inputs are not empty. Therefore, only the 'exclude' input was taken into account")}
      include<-NULL
    }
  }
  
  if (is.vector(include, mode="numeric")&(length(include) != 0)) {
    inc<-include
    sampledat<-sampledat[inc,,1]
    incmode<-"Numeric"
  } else if (is.vector(include, mode="character")&(length(include) != 0)) {
    inc<-as.vector(unlist(sapply(include, FUN=grep, x=as.vector(tmptmp[,2])[-1],ignore.case=FALSE)))    
    if (length(inc > 0)) {
      sampledat<-sampledat[(inc),,1]
      incmode<-"Char"
    }
  }
  
  # Termina aqui o código alterado 1
  
  GPL = as.character(sampledat[1, grep("platform_id", colnames(sampledat), 
                                       ignore.case = TRUE)])
  fd = new("AnnotatedDataFrame", data = data.frame(row.names = rownames(datamat)))
  
  # Começa aqui código alterado 2
  
  if (excmode == "Numeric") {
    datamat<-datamat[,exc]    
  } else if (excmode == "Char") {
    datamat<-datamat[,(exc*-1)]
  }
  if (incmode == "Numeric") {
    datamat<-datamat[,inc]    
  } else if (incmode == "Char") {
    datamat<-datamat[,inc]
  }
  
  # Termina aqui o código alterado 2
  
  if (is.null(nrow(datamat))) {
    tmpnames <- names(datamat)
    rownames(sampledat) <- tmpnames
    datamat = matrix(nrow = 0, ncol = nrow(sampledat))
    colnames(datamat) <- tmpnames
  } else {
    rownames(datamat) <- rownames(dat)
  }
  
  eset <- new("ExpressionSet", phenoData = as(sampledat, "AnnotatedDataFrame"), 
              annotation = GPL, featureData = fd, exprs = as.matrix(datamat))
  return(list(GPL = as.character(sampledat[1, grep("platform_id", 
                                                   colnames(sampledat), ignore.case = TRUE)]), eset = eset, header = header))
}



GEOpreprocessGSM <- function (fnames)
{ # TODO: dar trim nas infos, tão com espaço ainda nas bordas
  phenoDatas <- NULL
  valuesDatas <- NULL
  for (i in 1:length(fnames)) {
      dat <- readLines(fnames[i])
      nsamples <- sum(grepl("^!Sample_", dat))
      ncolDescriptions <- sum(grepl("^#", dat))
      con <- environment(parseGEO)$fileOpen(fnames[i])
      suppressWarnings(readLines(con, 1))
      tmpdat <- suppressWarnings(read.table(con, sep = "=", header = FALSE, strip.white = TRUE, 
                                            nrows = nsamples, quote = "", allowEscapes = FALSE,
                                            blank.lines.skip = TRUE, skipNul = TRUE))
      tmptmp <- t(tmpdat)
      sampledat <- rbind(data.frame(), tmptmp[-1, ])
      colnames(sampledat) <- make.unique(sub("^!Sample_", "", as.character(tmpdat[, 
                                                                                 1])))
      datamat <- suppressWarnings(read.delim(con, sep = "\t", strip.white = TRUE, skip = ncolDescriptions, header = TRUE, 
                                             na.strings = c("NA", "null", "NULL", "Null"), comment.char = "!"))
      close(con)
      tmprownames = datamat[, 1]
      datamat <- as.matrix(datamat[!is.na(tmprownames), -1])
      rownames(datamat) <- tmprownames[!is.na(tmprownames)]
      if (nrow(datamat) == 1) {
        datamat <- datamat[1:(nrow(datamat) - 1), ]
      } else {
        datamat <- as.matrix(datamat[1:(nrow(datamat) - 1), ])
      }
      rownames(sampledat) <- colnames(datamat)
      colnames(datamat) <- sampledat$geo_accession
      if (is.null(phenoDatas)){
        phenoDatas <- sampledat
      } else {
        phenoDatas <- rbind(phenoDatas, sampledat)
      }
      if (is.null(valuesDatas)) {
        valuesDatas <- datamat
      } else if (nrow(valuesDatas) == nrow(datamat)) {
        valuesDatas <- cbind(valuesDatas, datamat)
      }
  }
  
  if (is.null(nrow(valuesDatas))) {
    tmpnames <- names(valuesDatas)
    rownames(phenoDatas) <- tmpnames
    valuesDatas = matrix(nrow = 0, ncol = nrow(phenoDatas))
    colnames(valuesDatas) <- tmpnames
  } else {
    rownames(phenoDatas) <- colnames(valuesDatas)
  }
  
  GPL <- phenoDatas$platform_id[1]
  fd = new("AnnotatedDataFrame", data = data.frame(row.names = rownames(valuesDatas)))
  
  return (new("ExpressionSet", phenoData = as(phenoDatas, "AnnotatedDataFrame"), 
              annotation = GPL, featureData = fd, exprs = as.matrix(valuesDatas)))
}


GEOpreprocessGPL <- function (fname, exclude = NULL, include = NULL)
{
  dat <- suppressWarnings(readLines(fname, skipNul = TRUE))
  infoRowIndexes <- grep("^!Platform_(?!relation|sample_id|series_id)\\w*? = [^\t\r\n]", dat, perl = T)
  header <- data.frame(do.call(rbind, strsplit(x = dat[infoRowIndexes], split = " = ", fixed=TRUE)))
  #colInfoIndexes <- grep("^#[^\t\r\n]*? = [^\t\r\n]*?$", dat, perl = T)
  limits <- grep("^!platform_table_(?:begin|end)", dat, perl = T)
  tmpdat <- dat[(limits[1]+1):(limits[2]-1)]
  datamat <- suppressWarnings(data.frame(do.call(rbind, strsplit(tmpdat[-1], "\t", fixed=TRUE)), row.names = 1))
  cols <- strsplit(x = tmpdat[1], split = "\t", fixed=TRUE)[[1]][-1]
  colnames(datamat) <- cols
  return(list(dt = datamat, header = header))
}


readGPL <- function (fname) {
  dat <- suppressWarnings(readLines(fname, skipNul = TRUE)) 
  startIndex <- grep(pattern = "^[!]platform_table_begin", x = dat)
  endIndex <- grep(pattern = "^[!]platform_table_end", x = dat) - startIndex - 2
  con <- environment(parseGEO)$fileOpen(fname)
  readLines(con = con, skipNul = TRUE, n = startIndex)
  datamat <- read.table(con, header = TRUE, sep = "\t", allowEscapes = TRUE, strip.white = TRUE, skipNul = TRUE,
                       nrows = endIndex[1], quote = "")
  colnames(datamat) <- gsub(pattern = "\\.", x = colnames(datamat), replacement = " ")
  close(con)
  return (datamat)
}


insertGPL <- function(gplMatrix, exSet) {
  dat = gplMatrix
  sameProbes = intersect(rownames(gplMatrix), rownames(exprs(exSet)))
  if (length(sameProbes) == 0) { return(exSet) }
  newdat = data.frame(row.names = rownames(exprs(exSet)))
  rowCount = nrow(newdat)
  for(c in colnames(gplMatrix))
  {
    newdat[,c] = rep("", rowCount)
    newdat[sameProbes, c] = dat[sameProbes,c]
  }
  vmd <- colnames(newdat)
  fd <- new("AnnotatedDataFrame", data = newdat, varMetadata = data.frame(vmd, row.names = vmd))
  featureData(exSet) <- fd
  return (exSet)
}



readFilesSOFT <- function(files)
{
  readFirstLine = function(fname)
  {
    con = file(fname,"r")
    firstLine = readLines(con,n=1)
    close(con)
    return(firstLine)
  }
  verbose = exists("giveStatus", mode = "function")
  valDfs = list()
  annotDf = NULL
  finalDataMat <- data.frame()
  fi = 0
  fCount = length(files)
  for (f in files)
  {
    fi = fi + 1
    if (verbose) { giveStatus(percent = (fi * 100 / fCount),
                              message = paste0("Lendo arquivos SOFT (", fi, " de ", fCount, "): ", sub(pattern = ".*[/\\]", replacement = '', x = f))) }
    fline = readFirstLine(f)
    dat = readLines(f)
    if (fline == "^Annotation")
    {
      fline = dat[grep(pattern = "^[!]Annotation_platform\ =\ GPL", x = dat)]
    }
    geocode = strsplit(fline, " = ")[[1]][2]
    typecode = substr(geocode, 1, 3)
    if (typecode == "GSM") { tpref = "sample"
    } else if (typecode == "GPL") { tpref = "platform"
    } else { tpref = "series" }
    #con = environment(parseGEO)$fileOpen(f)
    startIndex = grep(pattern = sprintf("^[!]%s_table_begin", tpref), x = dat)
    lineCount = grep(pattern = sprintf("^[!]%s_table_end", tpref), x = dat) - startIndex - 2
    datamat = read.delim(f, skip = startIndex, nrows = lineCount, row.names = 1, check.names = F)
    if (typecode == "GPL") { annotDf = datamat } else {
      if (length(valDfs) == 0 || (nrow(datamat) == nrow(valDfs[[1]])) ) { valDfs[[geocode]] = datamat }
    }
  }
  if (length(valDfs) > 0)
  {
    valuesDf = data.frame(row.names = rownames(valDfs[[1]]))
    for (df in names(valDfs))
    {
      if (typecode == "GSE") { valuesDf[, colnames(valDfs[[df]])] = valDfs[[df]] } else { valuesDf[, df] = valDfs[[df]][,1] }
    }
    finalDataMat = valuesDf
  }
  if (!is.null(annotDf))
  {
    if (length(valDfs) > 0)
    {
      finalDataMat = merge(x = finalDataMat, y = annotDf, by = "row.names", all.x=T)
      rownames(finalDataMat) = finalDataMat[,1]
      finalDataMat = finalDataMat[,-1]
    } else { finalDataMat = annotDf }
  }
  return(finalDataMat)
}

txtToExpressionSet <- function(file)
{
  dat = read.delim(file = file, header = T, row.names = 1)
  return(dfToExpressionSet(dat))
}

setOrMerge <- function(objName, ...)
{
  insertObjs = list(...)
  if (length(insertObjs) > 0)
  {
    insNames = 1:length(insertObjs)
    currObj = NULL
    if (exists(objName))
    {
      currObj = get(objName)
      insertObjs[[length(insertObjs) + 1]] = currObj
    } else {
      currObj = insertObjs[[1]]
      insNames = insNames[-1]
    }
    for(nm in insNames)
    {
      
      commonNames = intersect(rownames(currObj), rownames(insertObjs[[nm]]))
      if (length(commonNames) < (nrow(currObj) / 2)) { next }
      currObj = merge(currObj, insertObjs[[nm]], by = 'row.names')
      rownames(currObj) = currObj[,1]
      currObj = currObj[,-1]
    }
    assign(x = objName, value = as.data.frame(currObj), envir = globalenv())
  }
}

setOrMergeESets <- function(objName, ...)
{
  insertObjs = list(...)
  if (length(insertObjs) > 0)
  {
    currObjAsESet = function(cobj)
    {
      if (is.matrix(cobj) || is.data.frame(cobj))
      {
        return(dfToExpressionSet(as.data.frame(cobj)))
      }
      return(cobj)
    }
    insNames = 1:length(insertObjs)
    currObj = NULL
    if (exists(objName))
    {
      currObj = get(objName)
      currObj = currObjAsESet(currObj)
      insertObjs[[length(insertObjs) + 1]] = currObj
    } else {
      currObj = insertObjs[[1]]
      currObj = currObjAsESet(currObj)
      insNames = insNames[-1]
    }
    for(nm in insNames)
    {
      insertObjs[[nm]] = currObjAsESet(insertObjs[[nm]])
      currObj = combineTwoExpressionSets(currObj, insertObjs[[nm]])
    }
    assign(x = objName, value = currObj, envir = globalenv())
  }
}


dfToExpressionSet <- function(df)
{
  library("GEOquery")
  isValType <- function(c){ return(class(df[,c]) %in% c("numeric", "integer")) }
  colOrder <- sapply(colnames(df), isValType)
  valCols <- colnames(df)[colOrder]
  attrCols <- colnames(df)[!colOrder]
  matValues <- as.matrix(subset(x = df, select=valCols))
  eset <- new("ExpressionSet", exprs = matValues)
  attributes(eset)$hasPlatform = (length(attrCols) > 0)
  if (length(attrCols) > 0)
  {
    dat <- subset(x=df, select=attrCols)
    vmd <- names(dat)
    featureData(eset) <- new("AnnotatedDataFrame", data = dat, varMetadata = data.frame(vmd, row.names = vmd))
  }
  return(eset)
}

hasPlatform <- function(eset)
{
  return(length(eset@featureData@data) > 0)
}

combineTwoExpressionSets <- function(esetA, esetB)
{
  if (is.null(esetA)) { return(esetB) }
  if (is.null(esetB)) { return(esetA) }
  sameProbes = intersect(rownames(exprs(esetA)), rownames(exprs(esetB)))
  dataNames = intersect(names(esetA@assayData), names(esetB@assayData))
  if (length(sameProbes) == 0 || length(dataNames) == 0) { return(esetA) }
  locEnv = new.env()
  for(dname in dataNames)
  {
    if (nrow(esetA@assayData[[dname]]) == nrow(esetB@assayData[[dname]]))
    {
      newMat = cbind(esetA@assayData[[dname]], esetB@assayData[[dname]])
    } else {
      newMat = merge(esetA@assayData[[dname]], esetB@assayData[[dname]], by='row.names')
      rownames(newMat) = newMat[,1]
      newMat = as.matrix(newMat[,-1])
    }
    locEnv[[dname]] = newMat
  }
  eset = new('ExpressionSet', assayData = locEnv)
  return(eset)
}

copyExpressionSet <- function(eSetSrc)
{
  sameProbes = rownames(exprs(eSetSrc))
  dataNames = names(eSetSrc@assayData)
  locEnv = new.env()
  for(dname in dataNames)
  {
    newMat = eSetSrc@assayData[[dname]][sameProbes,]
    locEnv[[dname]] = newMat
  }
  eset = new('ExpressionSet', assayData = locEnv)
  return(eset)
}

setGPLInSessionIfNotExists <- function(eset, gplName)
{
  if (!exists(gplName) && .hasSlot(eset, 'featureData') && dim(eset@featureData)[[2]] > 0)
  {
    assign(x = gplName, value = eset@featureData@data, envir = globalenv())
    return(T)
  }
  return(F)
}

writeRaw <- function(raw, file)
{
  if (class(raw)[[1]] == 'AffyBatch')
  {
    writeMatrix(exprs(raw), file)
  } else if (class(raw)[[1]] %in% c('ExpressionSet', 'ExpressionSetIllumina')) {
    for(nm in names(raw@assayData))
    {
      write.table(x = paste0('\n=> ', nm, '\n'), file = file, append = T, sep = '\t', row.names = F, col.names = F, quote = F)
      writeMatrix(as.data.frame(raw@assayData[[nm]]), file)
    }
  } else if (class(raw)[[1]] == 'RGList') {
    for(nm in names(raw))
    {
      if (class(raw[[nm]]) %in% c('data.frame', 'matrix'))
      {
        write.table(x = paste0('\n=> ', nm, '\n'), file = file, append = T, sep = '\t', row.names = F, col.names = F, quote = F)
        writeMatrix(as.data.frame(raw[[nm]]), file)
      }
    }
  }
}

writeMatrix <- function(mat, file)
{
  wMat = cbind(data.frame(ID = rownames(mat)), mat)
  write.table(x = wMat, file = file, append = T, sep = '\t', row.names = F, col.names = T, quote = F)
}

# TODO: Função de unir múltiplos GSEs!!