# TrAnSys
# Elaborado por Itamar José G. Nunes et al.

######################################################################################
# Funções para carregar amostras do Illumina

######################
# Argumentos iniciais e sub-argumentos:
# 1) matrixFile: Nome do arquivo de matriz
# 2) annotFile: Nome do arquivo de anotação, ou "?" se não houver arquivo
# 3) skipLines: Número de linhas a serem puladas na matriz principal (geralmente 0 ou 1)
# 4) idCol: Nome da coluna de ID da sonda, pois pode variar - padrão é "ID_REF"
# 5) normMethod: "quantile", "qspline", "vsn", "rankInvariant", "rsn", "median" ou "none"
# 6) transfMethod: "none", "log2", "neqc" ou "vst"
######################

######################
# Objetos obtidos:
# 1) SeriesMatrix
# 2) SeriesExpressionSet
# 3) PlatformMatrix (Se annotFile != "?")
# 4) PlatformHeader (Se annotFile != "?")
######################

# --> Possibilidades iniciais:
# - readBeadSummaryData:
# skip = número de linhas a serem puladas até chegar à tabela. O mesmo vale para o BGX
# ProbeID = "ID_REF", nome da primeira coluna
# method = "quantile", "qspline", "vsn", "rankInvariant", "rsn", "median" ou "none"
# transform = "none", "log2", "neqc" ou "vst"
# T = vetor de distribuição quando method="rankInvariant". Se nulo, é usada a média
# NOTA: O method = "vsn" já faz log2 dos valores, logo a transform deve ser obrigatoriamente "none"
# NOTA2: A documentação diz transform = "rsn", mas na verdade é method = "rsn"!!
# NOTA3: Quando transform = "neqc", as sondas de controle são removidas
# NOTA4: Mais pra frente, ver como funciona o lance do rankinvariant


library("beadarray")
library("illuminaio")

# Reads one or more tab-delimited Illumina summary tables and assembles them
# into a single "ExpressionSetIllumina" object.
#
# NOTE(review): the structure appears to follow the text reader of
# beadarray::readBeadSummaryData. The QC-file, sample-sheet and annotation
# inputs of that reader were hard-wired to NULL here, so their branches could
# never run and have been removed.
#
# Arguments:
#   dataFiles: character vector with the paths of the files to read. The header
#              line is searched for within the first 10 lines of each file and
#              must start with a field containing "ID" (case-insensitive).
#
# Returns:
#   An "ExpressionSetIllumina" whose assay slots (exprs, se.exprs,
#   nObservations, Detection) hold one column per sample. Slots whose columns
#   are missing from the input are filled with NA. featureData carries the
#   probe IDs, the annotation columns found (TargetID, PROBE_ID, SYMBOL) and a
#   "Status" column set to "regular".
#
# Errors / warnings:
#   - stops when no file is given, when a header line cannot be located, or
#     when none of the expected data columns is present;
#   - warns about (and skips) files whose probes do not match the first file.
readIlluminaTextSamplesGEAP <- function (dataFiles)
{
  if (length(dataFiles) == 0) {
    stop("No data files were supplied.")
  }

  # Column keywords (used as regular expressions against the column names).
  columns <- list(exprs = "AVG_Signal",
                  se.exprs = "BEAD_STDERR",
                  nObservations = "Avg_NBEADS",
                  Detection = "Detection Pval")
  annoNames <- c("TargetID", "PROBE_ID", "SYMBOL")

  # Removes the keyword, plus at most one neighbouring character on each side,
  # from the column names so that only the sample name remains.
  stripKeyword <- function(keyword, colNames) {
    sub(paste0("(.|)", keyword, "(.|)"), "", colNames)
  }

  idColName <- NULL
  r <- NULL
  for (f in dataFiles) {
    headLines <- readLines(f, n = 10)
    headIndex <- grep(pattern = "^([^\t\r\n]*?ID[^\t\r\n]*?)\t", headLines,
                      ignore.case = TRUE)[1]
    if (is.na(headIndex)) {
      stop("Could not find a header line starting with an ID column in the first 10 lines of ", f)
    }
    headerLine <- headLines[headIndex]

    # Name of the first (ID) column; the last file read determines the name.
    idColName <- sub(pattern = "^([^\t\r\n]*?ID[^\t\r\n]*?)\t.+",
                     replacement = "\\1", x = headerLine,
                     perl = TRUE, ignore.case = TRUE)

    # Actual spelling of the detection p-value keyword ("Detection Pval",
    # "Detection.Pval" or "Detection_Pval"). sub() returns its input unchanged
    # when nothing matches; in that case the current keyword is kept.
    pvalName <- sub(pattern = ".+?\t[^\t\r\n]*?(Detection[. _]Pval[^\t\r\n]*?).*",
                    replacement = "\\1", x = headerLine,
                    perl = TRUE, ignore.case = TRUE)
    if (!identical(pvalName, headerLine)) {
      columns$Detection <- pvalName
    }

    # row.names = 1 turns the ID column into row names; read.table() itself
    # rejects duplicated IDs, so the row names are guaranteed to be unique.
    currTable <- read.table(f, sep = "\t", header = TRUE,
                            skip = headIndex - 1, dec = ".", quote = "",
                            as.is = TRUE, row.names = 1, check.names = FALSE,
                            strip.white = TRUE, comment.char = "", fill = TRUE)

    if (is.null(r)) {
      r <- currTable
    } else if (nrow(currTable) != nrow(r)) {
      warning("File ", f, " has ", nrow(currTable), " rows but ", nrow(r),
              " were expected; file ignored.")
    } else {
      rowOrder <- match(rownames(r), rownames(currTable))
      if (anyNA(rowOrder)) {
        warning("File ", f, " does not contain the same probe IDs as the first file; file ignored.")
      } else {
        # Align by probe ID instead of trusting the row order of each file.
        r <- cbind(r, currTable[rowOrder, , drop = FALSE])
      }
    }
  }

  # The probe IDs are the row names; they are also stored as a regular column
  # so that an ID column named like an annotation column is kept as annotation.
  probeIds <- rownames(r)
  r[, idColName] <- probeIds

  annoIdx <- match(annoNames, colnames(r))
  annoIdx <- annoIdx[!is.na(annoIdx)]
  annoMat <- r[, annoIdx, drop = FALSE]

  # Locate the columns belonging to each assay slot.
  hits <- lapply(columns, function(keyword) grep(keyword, colnames(r)))
  ncols <- vapply(hits, length, integer(1))
  for (i in seq_along(columns)) {
    if (ncols[i] == 0) {
      cat("Could not find a column called: ", columns[[i]],
          "\n")
    }
  }
  if (sum(ncols) == 0) {
    stop("No data found, check your file or try changing the 'skip' argument")
  }

  # The first slot with the largest number of columns defines the sample names
  # used for slots that have to be filled with NA.
  nMax <- max(ncols)
  ref <- which(ncols == nMax)[1]
  defColNames <- stripKeyword(columns[[ref]], colnames(r)[hits[[ref]]])

  data <- vector("list", length(columns))
  names(data) <- names(columns)
  for (i in seq_along(columns)) {
    if (ncols[i] == nMax) {
      # drop = FALSE keeps a one-sample table two-dimensional.
      slotData <- r[, hits[[i]], drop = FALSE]
      colNames <- stripKeyword(columns[[i]], colnames(r)[hits[[i]]])
      # Repeated sample names get a ".repN" suffix.
      for (dupName in unique(colNames[duplicated(colNames)])) {
        sel <- colNames == dupName
        colNames[sel] <- paste0(colNames[sel], ".rep", seq_len(sum(sel)))
      }
      colnames(slotData) <- colNames
    } else {
      slotData <- matrix(NA, nrow(r), nMax)
      colnames(slotData) <- defColNames
    }
    rownames(slotData) <- probeIds
    data[[i]] <- slotData
  }

  BSData <- new("ExpressionSetIllumina")
  for (i in seq_along(data)) {
    slotIndex <- which(names(assayData(BSData)) == names(data)[i])
    if (ncols[i] == 0) {
      cat("Missing data - NAs stored in slot", names(data)[i],
          "\n")
    }
    assayData(BSData)[[slotIndex]] <- as.matrix(data[[i]])
  }

  sampleIds <- colnames(exprs(BSData))
  phenoData(BSData) <- new("AnnotatedDataFrame",
                           data.frame(sampleID = sampleIds,
                                      row.names = sampleIds))

  # The first column must be called "ProbeID", whatever the input header says.
  featureData(BSData) <- new("AnnotatedDataFrame",
                             data = data.frame(ProbeID = probeIds, annoMat,
                                               row.names = probeIds,
                                               Status = rep("regular", length(probeIds))))
  BSData@channelData[[1]] <- rep("G", length(sampleNames(BSData)))
  return(BSData)
}



# Reads several Illumina summary tables with beadarray::readBeadSummaryData and
# merges them into one expression set.
#
# Arguments:
#   files: character vector with the paths of the files to read. The header
#          line is searched for within the first 10 lines of each file and
#          must start with a field containing "ID" (case-insensitive).
#
# Returns:
#   The object returned by readBeadSummaryData for a single file, the result of
#   folding the per-file objects with combineTwoExpressionSets() for several
#   files, or NULL when 'files' is empty.
#
# Errors:
#   Stops when the header line of a file cannot be located.
readIlluminaMatrices <- function(files)
{
  eset <- NULL
  for (f in files) {
    headLines <- readLines(f, n = 10)
    headIndex <- grep(pattern = "^([^\t\r\n]*?ID[^\t\r\n]*?)\t", headLines,
                      ignore.case = TRUE)[1]
    if (is.na(headIndex)) {
      stop("Could not find a header line starting with an ID column in the first 10 lines of ", f)
    }
    headerLine <- headLines[headIndex]

    # Name of the first (ID) column of the table.
    idCol <- gsub(pattern = "^([^\t\r\n]*?ID[^\t\r\n]*?)\t.+",
                  replacement = "\\1", x = headerLine,
                  perl = TRUE, ignore.case = TRUE)

    # Actual spelling of the detection p-value keyword in this file. gsub()
    # returns its input unchanged when nothing matches, which would pass the
    # whole header line as a column name; fall back to the usual name instead.
    pvalCol <- gsub(pattern = ".+?\t[^\t\r\n]*?(Detection[.\\ \\_]Pval[^\t\r\n]*?).*",
                    replacement = "\\1", x = headerLine,
                    perl = TRUE, ignore.case = TRUE)
    if (identical(pvalCol, headerLine)) {
      pvalCol <- "Detection Pval"
    }

    pcols <- list(exprs = "AVG_Signal", se.exprs = "BEAD_STDERR",
                  nObservations = "Avg_NBEADS", Detection = pvalCol)
    curreset <- readBeadSummaryData(dataFile = f, skip = headIndex - 1,
                                    ProbeID = idCol, columns = pcols)
    if (is.null(eset)) {
      eset <- curreset
    } else {
      eset <- combineTwoExpressionSets(eset, curreset)
    }
  }
  return(eset)
}

# Loads an Illumina BGX manifest through readBGX() and returns its "probes"
# table with the rows named after the "Probe_Id" column.
#
# Arguments:
#   file: path of the BGX file, passed unchanged to readBGX().
#
# Returns:
#   The "probes" element of the readBGX() result, with Probe_Id as row names.
readIlluminaBGX <- function(file)
{
  probeTable <- readBGX(file)$probes
  rownames(probeTable) <- probeTable$Probe_Id
  probeTable
}

# Reads Illumina IDAT files together with their BGX manifest, processes the
# raw data with treatLimma() and attaches the platform table with insertGPL().
#
# Arguments:
#   idatFiles:     paths of the IDAT files (forwarded to read.idatTrAnSys).
#   bgxFile:       path of the BGX manifest (forwarded to read.idatTrAnSys).
#   bgMethod, normExpMethod, normMethod, cyclicMethod, optionMA:
#                  forwarded unchanged, in this order, to treatLimma().
#
# Returns:
#   The value returned by insertGPL(platmat, eset).
readGEAPIDAT <- function(idatFiles, bgxFile, bgMethod = "auto", normExpMethod = "rma", normMethod = "quantile", cyclicMethod = "fast", optionMA = "A")
{
  rawdat <- read.idatTrAnSys(idatfiles = idatFiles, bgxfile = bgxFile)

  # Keep only the file name of each sample (drops everything up to the last "/").
  colnames(rawdat$E) <- sub(pattern = ".*/", x = colnames(rawdat$E), replacement = "")

  # Platform table: first occurrence of every array address. A logical mask is
  # used because negative indexing with an empty index (no duplicates present)
  # would select zero rows instead of all of them.
  # NOTE(review): only the platform table is filtered; 'rawdat' itself is
  # passed to treatLimma() with every probe - confirm this is intended.
  firstOccurrence <- !duplicated(rawdat$genes$Array_Address_Id)
  platmat <- rawdat$genes[firstOccurrence, , drop = FALSE]
  rownames(platmat) <- platmat[, "Probe_Id"]

  eset <- treatLimma(rawdat, bgMethod, normExpMethod, normMethod, cyclicMethod, optionMA)
  eset <- insertGPL(platmat, eset)
  return(eset)
}
