#----Supplementary Work for the Thesis of Clifford Yacas -----
#----in fullfillment for M.Sc in Statistics----
#----Copyrighted 2020---


#---- Chapter 3 Methodology ----
#---- creating a function similar to 'preprocessFunNorm' ----

# need minfi package for supplementary functions for 450K data
# need pcaPP package for faster computation of Kendall's Tau
# need VineCopula package to calculate parameter of copula from 
# Kendall's Tau
# need pls package for efficient partial least squares computation
library(minfi)
library(pcaPP)
library(copula)
library(VineCopula)
library(pls)

# Main Function
# We note that many of the functions are either directly 
# from the original FunNorm function.
# We also note that the functions we've created are conceptually based
# on original FunNorm but have been changed in a way to implement 
# our ideas in Chapter 3.
preprocessFunnorm.bivar <- function(rgSet, ncomp=3, sex = NULL, bgCorr = TRUE, dyeCorr = TRUE, keepCN = TRUE, ratioConvert = TRUE, verbose = TRUE) {
  
  # makes sure that object is minfi-backed
  .isMatrixBackedOrStop(rgSet, "preprocessFunnorm.bivar")
  
  # makes sure that the object is'RGChannelSet' or 'RGChannelSetExtended'
  .isRGOrStop(rgSet)
  rgSet <- updateObject(rgSet) 
  
  # Background correction and dye bias normalization via noob
  # (preprocessNoob() function):
  if (bgCorr){
    if(verbose && dyeCorr) {
      message("[preprocessFunnorm.bivar] Background and dye bias correction with noob")
    } else {
      message("[preprocessFunnorm.bivar] Background correction with noob")
    }
    gmSet <- preprocessNoob(rgSet, dyeCorr = dyeCorr)
    if(verbose) message("[preprocessFunnorm.bivar] Mapping to genome")
    gmSet <- mapToGenome(gmSet)
  } else {
    if(verbose) message("[preprocessFunnorm.bivar] Mapping to genome")
    gmSet <- mapToGenome(rgSet)
  }
  # tells the user if the inner function should verbose
  subverbose <- max(as.integer(verbose) - 1L, 0)
  
  if(verbose) message("[preprocessFunnorm.bivar] Quantile extraction")
  
  # extraction of control probes from data as described by 
  # Fortin et al. (2014)
  extractedData <- .extractFromRGSet450k(rgSet)
  rm(rgSet)
  
  # adds sex vector if not specified before
  if (is.null(sex)) {
    gmSet <- addSex(gmSet, getSex(gmSet, cutoff = -3))
    sex <- rep(1L, length(gmSet$predictedSex))
    sex[gmSet$predictedSex == "F"] <- 2L
  }
  
  if(verbose) message("[preprocessFunnorm.bivar] Normalization")
  
  # Gets A matrix of copy number values of the data
  # Copy number variation is a phenomenon in which 
  # sections of the genome are repeated and the number 
  # of repeats in the genome varies between individuals.
  # Needed to convert data to ratio data. In our case,
  # to Beta-values
  if(keepCN) {
    CN <- getCN(gmSet)
  }
  
  # The main function that does our quantile normalization process
  # specified in the thesis in Chapter 3
  gmSet <- .normalizeFunnorm450k.bivar(object = gmSet, 
                                       extractedData = extractedData, 
                                       ncomp = ncomp, sex=sex, verbose = subverbose)
  
  # Stores preprocess method used.
  preprocessMethod <- c(preprocessMethod(gmSet),
                        mu.norm = sprintf("Funnorm.bivar, ncomp=%s", ncomp))
  
  # Gets Beta-values if specified
  if(ratioConvert) {
    grSet <- ratioConvert(gmSet, type = "Illumina", keepCN = keepCN)
    if(keepCN) {
      assay(grSet, "CN") <- CN
    }
    grSet@preprocessMethod <- preprocessMethod
    return(grSet)
  } else {
    gmSet@preprocessMethod <- preprocessMethod
    return(gmSet)
  }
}

#####   Functions and Helper Functions   #####

# Gets the indices of the probes belonging in either
# Type I Green, Type I Red, Type II and Sex chromosomes
.getFunnormIndices <- function(object) {
  .isGenomicOrStop(object)
  probeType <- getProbeType(object, withColor = TRUE)
  autosomal <- (seqnames(object) %in% paste0("chr", 1:22))
  indices <- list(IGrn = which(probeType == "IGrn" && autosomal),
                  IRed = which(probeType == "IRed" && autosomal),
                  II = which(probeType == "II" && autosomal),
                  X = which(seqnames(object) == "chrX"),
                  Y = which(seqnames(object) == "chrY"))
  indices
}


# The main function that does our normalization process 
# specified in Chapter 3
.normalizeFunnorm450k.bivar <- function(object, extractedData, ncomp, sex, verbose = FALSE) {
  normalizeQuantiles.bivar <- function(Meth, Unmeth, indices, sex = NULL) {
    # Get the methylated and unmethylated probe signals via
    # probe type
    Meth1 <- Meth[indices,,drop=FALSE]
    Unmeth1 <- Unmeth[indices,,drop=FALSE]
    # The old bivariate quantiles of the data as specified in the thesis.
    # We use the Frank copula and use v = u for the copula C(u,v).
    oldQuantiles <- .bivariate.quantiles.frank.final(Meth = Meth1, Unmeth = Unmeth1, 
                                                     alpha = probs)
    
    # regressing out unwanted variation by PLS regression
    # for either the autosomes or sex chromosomes
    if(is.null(sex)) {
      newQuantiles <- .returnNewQuantiles(controlMatrix = model.matrix, 
                                          quantiles = oldQuantiles, ncomp = ncomp)
    } else {
      newQuantiles <- .returnFitBySex.bivar(controlMatrix = model.matrix, 
                                            quantiles = oldQuantiles, 
                                            ncomp = ncomp, sex = sex)
    }
    # This function was from the original FunNorm
    # where linear interpolation will be done between the quantiles
    # after regressing out the variation. 
    .normalizeMatrix(cbind(Meth1,Unmeth1), newQuantiles)
  }
  indicesList <- .getFunnormIndices(object)
  
  # The control matrix is built from the extracted data
  # by using the summary statistics defined in Fortin et al. (2014)
  # for the control probes
  model.matrix <- .buildControlMatrix450k(extractedData)
  
  # Remove the first and last quantile level since we set the 1st quantile
  # to be 0 and the last quantile to be the 499th quantile value + 1000
  probs <- seq(from = 0, to = 1, length.out = 500)[c(-1,-500)]
  Meth <- getMeth(object)
  Unmeth <- getUnmeth(object)
  Combined <- cbind(Meth,Unmeth)
  n = ncol(Meth)
  if (ncomp > 0){
    
    # autosomes normalization
    for (type in c("IGrn", "IRed", "II")) {
      indices <- indicesList[[type]]
      if(length(indices) > 0) {
        if(verbose) message(sprintf("[normalizeFunnorm450k.bivar] Normalization of the %s probes", type))
        Combined[indices,] <- normalizeQuantiles.bivar(Meth = Meth,
                                                       Unmeth = Unmeth, 
                                                       indices = indices, 
                                                       sex = NULL)
        
      }
    }
    # X-chrom normalization
    indices <- indicesList[["X"]]
    if(length(indices) > 0) {
      if(verbose) message("[normalizeFunnorm450k.bivar] Normalization of the X-chromosome")
      Combined[indices,] <- normalizeQuantiles.bivar(Meth =Meth,
                                                     Unmeth = Unmeth, 
                                                     indices = indices, 
                                                     sex = sex)
      #()
    }
  }
  #()
  
  Meth = Combined[,c(1:n)]
  Unmeth = Combined[,c((n+1):(2*n))]
  
  # Y-chrom normalization
  # For Y-chrom, we use quantile normalization defined by Bolstad et al. (2003)
  # This method is in the preprocessCore package but is imported with minfi
  indices <- indicesList[["Y"]]
  if(length(indices) > 0) {
    if(verbose) message("[normalizeFunnorm450k.bivar] Normalization of the Y-chromosome")
    sex <- as.character(sex)
    levels <- unique(sex)
    nSexes <- length(levels)
    #()
    if (nSexes == 2) {
      level1 <- levels[1]
      level2 <- levels[2]
    }
    if (nSexes == 2) {
      if (sum(sex == level1)>1) {
        Meth[indices, sex==level1]   <- preprocessCore::normalize.quantiles(Meth[indices, sex == level1, drop=FALSE])
        Unmeth[indices, sex==level1] <- preprocessCore::normalize.quantiles(Unmeth[indices, sex == level1,drop=FALSE])
      }
      if (sum(sex == level2)>1) {
        Meth[indices, sex==level2]   <- preprocessCore::normalize.quantiles(Meth[indices, sex == level2,drop=FALSE])
        Unmeth[indices, sex==level2] <- preprocessCore::normalize.quantiles(Unmeth[indices, sex == level2,drop=FALSE])
      }
    } else {
      Meth[indices,] <- preprocessCore::normalize.quantiles(Meth[indices,])
      Unmeth[indices,] <- preprocessCore::normalize.quantiles(Unmeth[indices,])
    }
  }
  
  assay(object, "Meth") <- Meth
  assay(object, "Unmeth") <- Unmeth
  return(object)
}



### To extract quantiles and control probes from rgSet
.extractFromRGSet450k <- function(rgSet) {
  rgSet <- updateObject(rgSet)
  controlType <- c("BISULFITE CONVERSION I",
                   "BISULFITE CONVERSION II",
                   "EXTENSION",
                   "HYBRIDIZATION",
                   "NEGATIVE",
                   "NON-POLYMORPHIC",
                   "NORM_A",
                   "NORM_C",
                   "NORM_G",
                   "NORM_T",
                   "SPECIFICITY I",
                   "SPECIFICITY II",
                   "TARGET REMOVAL",
                   "STAINING")
  #()
  array <- annotation(rgSet)[["array"]]
  #()
  ## controlAddr <- getControlAddress(rgSet, controlType = controlType, asList = TRUE)
  ctrls <- getProbeInfo(rgSet, type = "Control")
  #()
  if(!all(controlType %in% ctrls$Type))
    stop("The `rgSet` does not contain all necessary control probes")
  ctrlsList <- split(ctrls, ctrls$Type)[controlType]
  #()
  
  redControls <- getRed(rgSet)[ctrls$Address,,drop=FALSE]
  #()
  redControls <- lapply(ctrlsList, function(ctl) redControls[ctl$Address,,drop=FALSE])
  #()
  greenControls <- getGreen(rgSet)[ctrls$Address,,drop=FALSE]
  #()
  greenControls <- lapply(ctrlsList, function(ctl) greenControls[ctl$Address,,drop=FALSE])
  #()
  
  ## Extraction of the undefined negative control probes
  oobRaw <- getOOB(rgSet)
  probs <- c(0.01, 0.50, 0.99)
  greenOOB <- t(colQuantiles(oobRaw$Grn, na.rm = TRUE, probs = probs))
  redOOB   <- t(colQuantiles(oobRaw$Red, na.rm=TRUE,  probs = probs))
  oob      <- list(greenOOB = greenOOB, redOOB = redOOB)
  
  return(list(
    greenControls = greenControls,
    redControls = redControls,
    oob = oob, ctrlsList = ctrlsList,
    array = array))
}


## Extraction of the Control matrix from the control probes in extracted data
## These are the summary statistics of the control probes 
## as specified in Fortin et al. (2014)
.buildControlMatrix450k <- function(extractedData) {
  getCtrlsAddr <- function(exType, index) {
    ctrls <- ctrlsList[[index]]
    addr <- ctrls$Address
    names(addr) <- ctrls$ExtendedType
    na.omit(addr[exType])
  }
  
  array <- extractedData$array
  greenControls <- extractedData$greenControls
  redControls <- extractedData$redControls
  controlNames <- names(greenControls)
  ctrlsList <- extractedData$ctrlsList
  
  ## Bisulfite conversion extraction for probe type II:
  index <- match("BISULFITE CONVERSION II", controlNames)
  redControls.current <- redControls[[ index ]]
  bisulfite2 <- colMeans2(redControls.current, na.rm = TRUE)
  
  ## Bisulfite conversion extraction for probe type I:
  index <- match("BISULFITE CONVERSION I", controlNames)
  if (array=="IlluminaHumanMethylation450k"){
    addr <- getCtrlsAddr(exType = sprintf("BS Conversion I%sC%s", c(" ", "-", "-"), 1:3), index = index)
  } else {
    addr <- getCtrlsAddr(exType = sprintf("BS Conversion I%sC%s", c("-", "-"), 1:2), index = index)
  }
  greenControls.current <- greenControls[[ index ]][addr,,drop=FALSE]
  if (array=="IlluminaHumanMethylation450k"){
    addr <- getCtrlsAddr(exType = sprintf("BS Conversion I-C%s", 4:6), index = index)
  } else {
    addr <- getCtrlsAddr(exType = sprintf("BS Conversion I-C%s", 3:5), index = index)
  }
  redControls.current <- redControls[[ index ]][addr,, drop=FALSE]
  if (nrow(redControls.current)==nrow(greenControls.current)){
    bisulfite1 <- colMeans2(redControls.current + greenControls.current, na.rm = TRUE)
  } else {
    bisulfite1 <- colMeans2(redControls.current, na.rm=TRUE) + colMeans2(greenControls.current, na.rm = TRUE)
  }
  
  
  ## Staining
  index <- match("STAINING", controlNames)
  addr <- getCtrlsAddr(exType = "Biotin (High)", index = index)
  stain.green <- t(greenControls[[ index ]][addr,,drop=FALSE])
  addr <- getCtrlsAddr(exType = "DNP (High)", index = index)
  stain.red <- t(redControls[[ index ]][addr,, drop=FALSE ])
  
  ## Extension
  index <-    match("EXTENSION", controlNames)
  addr <- getCtrlsAddr(exType = sprintf("Extension (%s)", c("A", "T")), index = index)
  extension.red <- t(redControls[[index]][addr,,drop=FALSE])
  colnames(extension.red) <- paste0("extRed", 1:ncol(extension.red))
  addr <- getCtrlsAddr(exType = sprintf("Extension (%s)", c("C", "G")), index = index)
  extension.green <- t(greenControls[[index]][addr,,drop=FALSE])
  colnames(extension.green) <- paste0("extGrn", 1:ncol(extension.green))
  
  ## Hybridization should be monitored only in the green channel
  index <- match("HYBRIDIZATION", controlNames)
  hybe <- t(greenControls[[index]])
  colnames(hybe) <- paste0("hybe", 1:ncol(hybe))
  
  ## Target removal should be low compared to hybridization probes
  index <- match("TARGET REMOVAL", controlNames)
  targetrem <- t(greenControls[[index]])
  colnames(targetrem) <- paste0("targetrem", 1:ncol(targetrem))
  
  ## Non-polymorphic probes
  index <- match("NON-POLYMORPHIC", controlNames)
  addr <- getCtrlsAddr(exType = sprintf("NP (%s)", c("A", "T")), index = index)
  nonpoly.red <- t(redControls[[index]][addr, ,drop=FALSE])
  colnames(nonpoly.red) <- paste0("nonpolyRed", 1:ncol(nonpoly.red))
  addr <- getCtrlsAddr(exType = sprintf("NP (%s)", c("C", "G")), index = index)
  nonpoly.green <- t(greenControls[[index]][addr, ,drop=FALSE])
  colnames(nonpoly.green) <- paste0("nonpolyGrn", 1:ncol(nonpoly.green))
  
  ## Specificity II
  index <- match("SPECIFICITY II", controlNames)
  greenControls.current <- greenControls[[index]]
  redControls.current <- redControls[[index]]
  spec2.green <- t(greenControls.current)
  colnames(spec2.green) <- paste0("spec2Grn", 1:ncol(spec2.green))
  spec2.red <- t(redControls.current)
  colnames(spec2.red) <- paste0("spec2Red", 1:ncol(spec2.red))
  spec2.ratio <- colMeans2(greenControls.current, na.rm = TRUE) /
    colMeans2(redControls.current, na.rm = TRUE)
  
  ## Specificity I
  index <- match("SPECIFICITY I", controlNames)
  addr <- getCtrlsAddr(exType = sprintf("GT Mismatch %s (PM)", 1:3), index = index)
  greenControls.current <- greenControls[[index]][addr,,drop=FALSE]
  redControls.current <- redControls[[index]][addr,,drop=FALSE]
  spec1.green <- t(greenControls.current)
  colnames(spec1.green) <- paste0("spec1Grn", 1:ncol(spec1.green))
  spec1.ratio1 <- colMeans2(redControls.current, na.rm = TRUE) /
    colMeans2(greenControls.current, na.rm = TRUE)
  
  index <- match("SPECIFICITY I", controlNames) # Added that line
  addr <- getCtrlsAddr(exType = sprintf("GT Mismatch %s (PM)", 4:6), index = index)
  greenControls.current <- greenControls[[index]][addr,,drop=FALSE]
  redControls.current <- redControls[[index]][addr,,drop=FALSE]
  spec1.red <- t(redControls.current)
  colnames(spec1.red) <- paste0("spec1Red", 1:ncol(spec1.red))
  spec1.ratio2 <- colMeans2(greenControls.current, na.rm = TRUE) /
    colMeans2(redControls.current, na.rm = TRUE)
  spec1.ratio <- (spec1.ratio1 + spec1.ratio2) / 2
  
  ## Normalization probes:
  index <- match(c("NORM_A"), controlNames)
  normA <- colMeans2(redControls[[index]], na.rm = TRUE)
  index <- match(c("NORM_T"), controlNames)
  normT <- colMeans2(redControls[[index]], na.rm = TRUE)
  index <- match(c("NORM_C"), controlNames)
  normC <- colMeans2(greenControls[[index]], na.rm = TRUE)
  index <- match(c("NORM_G"), controlNames)
  normG <- colMeans2(greenControls[[index]], na.rm = TRUE)
  
  dyebias <- (normC + normG)/(normA + normT)
  
  oobG <- extractedData$oob$greenOOB
  oobR <- extractedData$oob$redOOB
  oob.ratio <- oobG[2,]/oobR[2,]
  oobG <- t(oobG)
  colnames(oobG) <- paste0("oob", c(1,50,99))
  
  model.matrix <- cbind(
    bisulfite1, bisulfite2, extension.green, extension.red, hybe,
    stain.green, stain.red, nonpoly.green, nonpoly.red,
    targetrem, spec1.green, spec1.red, spec2.green, spec2.red, spec1.ratio1,
    spec1.ratio, spec2.ratio, spec1.ratio2, normA, normC, normT, normG, dyebias,
    oobG, oob.ratio)
  
  
  ## Imputation
  for (colindex in 1:ncol(model.matrix)) {
    if(any(is.na(model.matrix[,colindex]))) {
      column <- model.matrix[,colindex]
      column[is.na(column)]    <- mean(column, na.rm = TRUE)
      model.matrix[ , colindex] <- column
    }
  }
  
  ## Scaling
  model.matrix <- scale(model.matrix)
  
  ## Fixing outliers
  model.matrix[model.matrix > 3] <- 3
  model.matrix[model.matrix < (-3)] <- -3
  
  ## Rescaling
  model.matrix <- scale(model.matrix)
  
  return(model.matrix)
}

# The old quantiles in a bivariate manner using our definition
# of a bivariate quantile. We use the Frank copula with C(u,v)
# where v =u in our case.
.bivariate.quantiles.frank.final = function(Meth, Unmeth, alpha){
  n = ncol(Meth)
  Meth.sort = apply(Meth, 2, sort)
  Unmeth.sort = apply(Unmeth, 2, sort)
  bivariate.list = c()
  
  for (i in 1:n){
    m = length(Meth[,i])
    tau = cor.fk(Meth[,i],Unmeth[,i])
    theta = BiCopTau2Par(family = 5, tau = tau)
    roots = (log(1 - sign(theta)* sqrt(-exp(-theta) 
                                       - exp(-theta*alpha) 
                                       + exp(-theta*(alpha+1)) + 1))) / -theta
    quants.meth = Meth.sort[,i][m*roots]
    quants.unmeth = Unmeth.sort[,i][m*roots]
    b.quants = cbind(quants.meth,quants.unmeth)
    colnames(b.quants) = c(paste(colnames(Meth)[i], "Meth" ,sep = '.'), 
                           paste(colnames(Unmeth)[i], "Unmeth", sep = '.'))
    bivariate.list = cbind(bivariate.list, b.quants)
  }
  row.names(bivariate.list) = as.character(alpha)
  return(bivariate.list)
}



# Return the normalized quantiles in a bivariate manner
# for the autosomes
.returnNewQuantiles = function(controlMatrix, quantiles, ncomp){
  n =  ncol(quantiles)
  quantiles = rbind(numeric(n),quantiles, quantiles[nrow(quantiles),] + 1000)
  rownames(quantiles)[1] = '0.0'; rownames(quantiles)[500] = '1.0'
  
  newQuantiles = array(0L, dim(quantiles))
  #()
  for (i in 2:nrow(quantiles)){
    #()
    quant = matrix(quantiles[i,], n/2,2, byrow = T)
    #()
    meanFunction = colMeans2(quant)
    res = sweep(quant,2,meanFunction)
    fit = plsr(res ~ controlMatrix,ncomp = ncomp)
    newfit = sweep(fit$residuals[, , paste(as.character(ncomp), 'comps')],2,-meanFunction)
    newQuantiles[i,] = as.vector(newfit)
  }
  newQuantiles = .regularizeQuantiles(newQuantiles)
  
  #Putting row and column names for convenience
  nam = c()
  nam = append(nam,colnames(quantiles)[seq(1,n-1,2)])
  nam = append(nam,colnames(quantiles)[seq(2,n,2)])
  colnames(newQuantiles) = nam
  row.names(newQuantiles) = row.names(quantiles)
  return(newQuantiles)
}

### Return the normalized quantile functions in a bivariate manner for sex
.sortbysex = function(sex, levels, newQuantiles1, newQuantiles2){
  #note that newQuantiles1 is for female in this example
  #and newQuantiles2 is for male in this example
  n = ncol(newQuantiles1)/2
  m = ncol(newQuantiles2)/2
  l = length(sex)
  # m+n = l
  newQuantiles1.meth = newQuantiles1[,c(1:n)]
  newQuantiles1.unmeth = newQuantiles1[,c((n+1):(2*n))]
  newQuantiles2.meth = newQuantiles2[,c(1:m)]
  newQuantiles2.unmeth = newQuantiles2[,c((m+1):(2*m))]
  meth.sort =  matrix(0,nrow=500,ncol=(n+m))
  unmeth.sort = matrix(0,nrow=500,ncol=(n+m))
  #()
  for (i in 1:l){
    if (sex[i] == levels[1]){
      meth.sort[,i] <- newQuantiles1.meth[,1]
      unmeth.sort[,i] <- newQuantiles1.unmeth[,1]
      newQuantiles1.meth = as.matrix(newQuantiles1.meth[,-1])
      newQuantiles1.unmeth = as.matrix(newQuantiles1.unmeth[,-1])
      #()
    } else{
      meth.sort[,i] <- newQuantiles2.meth[,1]
      unmeth.sort[,i] <- newQuantiles2.unmeth[,1]
      newQuantiles2.meth = as.matrix(newQuantiles2.meth[,-1])
      newQuantiles2.unmeth = as.matrix(newQuantiles2.unmeth[,-1])
      #()
    }
  }
  fulllist = cbind(meth.sort,unmeth.sort)
  return(fulllist)
}

# Helper function for .returnFitBySex.bivar
# Needed when normalization should be done separately by sex
.sex.bivariate = function(sex){
  sex.biv = c()
  for (i in 1:length(sex)){
    sex.val = rep(sex[i],2)
    sex.biv = append(sex.biv,sex.val)
  }
  return(sex.biv)
}

.returnFitBySex.bivar <- function(controlMatrix, quantiles, ncomp, sex) {
  stopifnot(is.matrix(quantiles))
  stopifnot(is.matrix(controlMatrix))
  sex    <- as.character(sex)
  levels <- unique(sex)
  nSexes <- length(levels)
  if (nSexes == 2) {
    sex1 <- sum(sex == levels[1])
    sex2 <- sum(sex == levels[2])
    
  } else {
    sex1 <- sum(sex == levels[1])
    sex2 <- 0
  }
  
  ## When normalization should not be performed by sex separately:
  if ((sex1 <= 10) | (sex2 <= 10)) {
    newQuantiles <- .returnNewQuantiles(controlMatrix = controlMatrix,
                                        quantiles = quantiles,
                                        ncomp = ncomp)
  } else {
    sex.bivar = .sex.bivariate(sex=sex)
    
    quantiles1 <- quantiles[, sex.bivar == levels[1]]
    controlMatrix1 <- controlMatrix[sex == levels[1], ]
    newQuantiles1 <- .returnNewQuantiles(controlMatrix = controlMatrix1,
                                         quantiles = quantiles1,
                                         ncomp = ncomp)
    
    quantiles2 <- quantiles[, sex.bivar == levels[2]]
    controlMatrix2 <- controlMatrix[sex == levels[2], ]
    newQuantiles2 <- .returnNewQuantiles(controlMatrix = controlMatrix2,
                                         quantiles = quantiles2,
                                         ncomp = ncomp)
    
    newQuantiles = .sortbysex(sex = sex, levels=levels, 
                              newQuantiles1=newQuantiles1, 
                              newQuantiles2=newQuantiles2)
  }
  
  return(newQuantiles)
}

# Normalize a matrix of intensities via linear interpolation
# between the quantile values and
# quantile normalization using this new distribution as target/reference 
# distribution.
# This then makes the new methylated and unmethylated values come
# from this distribution
.normalizeMatrix <- function(intMatrix, newQuantiles) {
  n <- nrow(newQuantiles)
  normMatrix <- sapply(1:ncol(intMatrix), function(i) {
    crtColumn <- intMatrix[ , i]
    crtColumn.reduced <- crtColumn[!is.na(crtColumn)]
    ## Generation of the corrected intensities:
    target <- sapply(1:(n-1), function(j) {
      start <- newQuantiles[j,i]
      end <- newQuantiles[j+1,i]
      if (!isTRUE(all.equal(start,end))){
        sequence <- seq(start, end,( end-start)/n)[-(n+1)]
      } else {
        sequence <- rep(start, n)
      }
      return(sequence)
    })
    target <- as.vector(target)
    result <- preprocessCore::normalize.quantiles.use.target(matrix(crtColumn.reduced), target)
    return(result)
  })
  return(normMatrix)
}

# To ensure a monotonically increasing and non-negative quantile function
.regularizeQuantiles <- function(x){
  x[x<0] <- 0
  colCummaxs(x)
}

# Some other helper functions obtained fromt the utilities in minfi
# so to make sure the data we work with is the correct object class
.isMatrixBackedOrStop <- function(object, FUN) {
  if (!.isMatrixBacked(object)) {
    stop("'", FUN, "()' only supports matrix-backed minfi objects.",
         call. = FALSE)
  }
}

.isMatrixBacked <- function(object) {
  stopifnot(is(object, "SummarizedExperiment"))
  all(vapply(assays(object), is.matrix, logical(1L)))
}

.isRGOrStop <- function(object) {
  if (!is(object, "RGChannelSet")) {
    stop("object is of class '", class(object), "', but needs to be of ",
         "class 'RGChannelSet' or 'RGChannelSetExtended'")
  }
}

.isGenomicOrStop <- function(object) {
  if (!is(object, "GenomicMethylSet") && !is(object, "GenomicRatioSet")) {
    stop("object is of class '", class(object), "', but needs to be of ",
         "class 'GenomicMethylSet' or 'GenomicRatioSet'")
  }
}






#---- Chapter 4: Second Methodology ----

# Need minfi package for supplementary functions for 450K data.
# Need pcaPP pacakge for faster computation of Kendall's Tau.
# Need VineCopula package to calculate parameter of copula from 
# Kendall's Tau.
# Need pls package for efficient partial least squares computation.
# Need bivariate package for efficient computation of joint probabilities
library(minfi)
library(VineCopula)
library(pcaPP)
library(bivariate)
library(pls)

#---- Functions ---
# Getting the probabilities that stem from Methylated values for faster 
# computation
getmeth.probs = function(Meth){
  n = ncol(Meth)
  Fhat = vector("list",n)
  Fhat1 = matrix(NA, nrow=nrow(Meth), ncol=n)
  for (i in 1:n) {
    Fhat[[i]] = ecdf(Meth[,i])
  }
  for (i in 1:n) {
    Fhat1[,i] = Fhat[[i]](Meth[,i])
  }
  return(Fhat1)
}

# This function retrieves the joint probabilities for each probe
# for each sample.
get.bivariate.probabilities = function(Meth, Unmeth){
  n = ncol(Meth)
  Fhat = vector("list",n)
  Fhat1 = matrix(NA, nrow=nrow(Meth), ncol=n)
  for (i in 1:n) {
    Fhat[[i]] = ebvcdf(Meth[,i],Unmeth[,i])
  }
  for (i in 1:n) {
    Fhat1[,i] = Fhat[[i]](Meth[,i], Unmeth[,i])
  }
  return(Fhat1)
}


# We now retrieve the probability points discussed in Chapter 4.2
# by letting v =1-u for the copula C(u,v)
bivariate.quantiles.frank.final.2 = function(Meth, Unmeth, alpha){
  n = ncol(Meth)
  Meth.sort = apply(Meth, 2, sort)
  Unmeth.sort = apply(Unmeth, 2, sort)
  bivariate.list = c()
  for (i in 1:n){
    m = length(Meth[,i])
    tau = cor.fk(Meth[,i],Unmeth[,i])
    theta = BiCopTau2Par(family = 5, tau = tau)
    
    a = exp(theta*(alpha - 1)) - exp(theta*alpha) + 1
    b = -2*exp(-theta)
    c = exp(-theta)
    
    roots = (log((-b + sqrt(b^2 -4*a*c))/(2*a))) / (-theta)
    roots2 = 1 - roots
    
    roots[roots == 0] <- 1/m
    roots2[roots2 == 0] <-1/m
    
    quants.meth = Meth.sort[,i][m*roots]
    quants.unmeth = Unmeth.sort[,i][m*roots2]
    
    b.quants = cbind(quants.meth,quants.unmeth)
    colnames(b.quants) = c(paste(colnames(Meth)[i], "Meth" ,sep = '.'), 
                           paste(colnames(Unmeth)[i], "Unmeth", sep = '.'))
    bivariate.list = cbind(bivariate.list, b.quants)
  }
  return(bivariate.list)
}

# For the methodology described in 4.2, we retrieve the minimums and maximums 
# instead of setting the minimums to 0 and the maximums to the second highest
# value + 1000
returnNewQuantiles2 = function(controlMatrix, quantiles, ncomp){
  
  n =  ncol(quantiles)
  newQuantiles = array(0L, dim(quantiles))
  
  for (i in 1:nrow(quantiles)){
    quant = matrix(quantiles[i,], n/2,2, byrow = T)
    meanFunction = colMeans2(quant)
    res = sweep(quant,2,meanFunction)
    fit = plsr(res ~ controlMatrix,ncomp = ncomp)
    newfit = sweep(fit$residuals[, , paste(as.character(ncomp), 'comps')],2,-meanFunction)
    newQuantiles[i,] = as.vector(newfit)
  }
  newQuantiles = .regularizeQuantiles(newQuantiles)
  #Putting row and column names for convenience, in case if you want to look
  nam = c()
  nam = append(nam,colnames(quantiles)[seq(1,n-1,2)])
  nam = append(nam,colnames(quantiles)[seq(2,n,2)])
  colnames(newQuantiles) = nam
  row.names(newQuantiles) = row.names(quantiles)
  return(newQuantiles)
}

# We do this regularization in this way since the values in these 
# bivariate probability points are bidirectional, ie when 
# one value increases, the other value decreases
.regularizeQuantiles <- function(x){
  n = ncol(x) /2
  x[x<0] <- 0
  x = cbind(colCummaxs(x[,1:n]),
            colCummins(x[,(n+1):(2*n)]))
}

# We now take get the adjustments that were done to the quantiles
# by the PLS regression and store them
adjustment = function(oldQuantiles, newQuantiles){
  n = ncol(oldQuantiles)
  old.meth = oldQuantiles[,seq(1,(n - 1),by=2)]
  old.unmeth = oldQuantiles[,seq(2,n,by=2)]
  #browser()
  oldquants = cbind(old.meth,old.unmeth)
  adjust = newQuantiles - oldquants
  return(adjust)
}

# The adjustment process described in the first and
# second Variation Methodologies
normalize.bivariate = function(Meth, Unmeth, joint.ecdf, 
                               quantiles.edf, adjustments){
  n = ncol(Meth)
  m = nrow(Meth)
  s = dim(quantiles.edf)[1]
  
  adjust.meth = adjustments[,1:n]
  adjust.unmeth = adjustments[,(n+1):(2*n)]
  
  Meth.new = matrix(NA, nrow=m,  ncol = n)
  Unmeth.new =  matrix(NA, nrow=m,  ncol = n)
  for (i in 1:n){
    for (j in 1:(s-1)){
      
      f2 = quantiles.edf[(j+1),i]
      f1 = quantiles.edf[j,i]
      
      indices = which( f1<=joint.ecdf[,i] & joint.ecdf[,i] < f2)
      fhat = joint.ecdf[indices, i]
      y = cbind(Meth[indices,i],Unmeth[indices,i])
      
      adjust.k2 = cbind(adjust.meth[(j+1),i], adjust.unmeth[(j+1),i])
      adjust.k1 = cbind(adjust.meth[j,i], adjust.unmeth[j,i])
      
      weight = (fhat - f1) / (f2 - f1)
      diff.k = adjust.k2 - adjust.k1
      
      adjust.y = cbind(adjust.k1[1] + weight*diff.k[1], adjust.k1[2] + weight*diff.k[2])
      yhat = y + adjust.y
      
      # Methylated and Unmethylated values must be >= 0
      yhat[yhat<0] <-0
      Meth.new[indices, i] = yhat[,1]
      Unmeth.new[indices, i] = yhat[,2]
    }
  }
  return(cbind(Meth.new,Unmeth.new))
}

# ---- Applying the methodology to the Mania dataset ----

# Retrieving the dataset from where is was stored
RGSet <- read.metharray.exp('D:/methyldatamedium/methyldatamedium')

# Getting the phenotypes of the samples (whether a patient has Mania or not)
type.mania = c()
for (i in seq(1,80, by =2)){
  l = as.character(type_mania[i,1])
  type.mania = append(type.mania, l)
}

# We follow the steps done from FunNorm to the data prior
# to normalization. We use the functions from the Chapter 3 
# methodology to get what we need
gmSet <- preprocessNoob(RGSet, dyeCorr = T)
gmSet <- mapToGenome(gmSet)

extractedData = .extractFromRGSet450k(RGSet)
mod.mat = .buildControlMatrix450k(extractedData)

indices = .getFunnormIndices(gmSet)
Meth.cor = getMeth(gmSet)
Unmeth.cor = getUnmeth(gmSet)

probs <- seq(from = 0, to = 1, length.out = 501)



#---- Type I Green probes ----
mat1.meth = Meth.cor[indices[[1]],,drop=FALSE]
mat1.unmeth = Unmeth.cor[indices[[1]],,drop=FALSE]

Meth.green.probs = getmeth.probs(mat1.meth)

system.time({joint.edf.green = 
  get.bivariate.probabilities(mat1.meth,mat1.unmeth)})
# user    system   elapsed 
# 1017.23   2.57  1044.25 


# Using the law of total probability P(a) = P(a & b) + P(a & b^c) to get 
# P(Meth<M,Unmeth>U)
joint.prob.green.2 = Meth.green.probs - joint.edf.green


# Probability points stemming from v = 1-u and the corresponding adjustments 
quants.green.2 = bivariate.quantiles.frank.final.2(mat1.meth,
                                                    mat1.unmeth,
                                                    probs)
newQuantiles.green.2 = returnNewQuantiles2(controlMatrix = mod.mat,
                                            quantiles = quants.green.2,
                                            ncomp = 3)

edf.quantiles.green.2 = joint.edf.of.quantiles.2(mat1.meth,mat1.unmeth,
                                                  quants.green.2)

adjust.green.2 = adjustment(oldQuantiles = quants.green.2,
                             newQuantiles = newQuantiles.green.2)

#Second Variation Normalization
fit1.green = normalize.bivariate(mat1.meth,mat1.unmeth,
                                      joint.ecdf = joint.prob.green.2,
                                      quantiles.edf = edf.quantiles.green.2,
                                      adjustments = adjust.green.2)

# Getting the Methylated and Unmethylated signals
fitting.green.meth = fitting.green[,1:40]
fitting.green.unmeth = fitting.green[,41:80]

# beta-values
fit.beta.green = fitting.green.meth / (fitting.green.meth + fitting.green.unmeth +100)

#plot
densityPlot(fit.beta.green, sampGroups = type.mania)

#---- Type I Red Probes ---- 
mat2.meth = Meth.cor[indices[[2]],,drop=FALSE]
mat2.unmeth = Unmeth.cor[indices[[2]],,drop=FALSE]

Meth.red.probs = .getmeth.probs(mat2.meth)
system.time({joint.prob.red.1 = 
  .get.bivariate.probabilities(mat2.meth,mat2.unmeth)})
#user  system elapsed 
#3755.82  157.16 3977.30 

# Using the law of total probability P(a) = P(a & b) + P(a & b^c) to get 
# P(Meth<M,Unmeth>U)
joint.prob.red.2 = Meth.red.probs - joint.prob.red.1


# Probability points stemming from v = 1-u and the corresponding adjustments 
quants.red.2 = .bivariate.quantiles.frank.final.2(mat2.meth,
                                                  mat2.unmeth,
                                                  probs)

newQuantiles.red.2 = .returnNewQuantiles2(controlMatrix = mod.mat,
                                          quantiles = quants.red.2,
                                          ncomp = 3)

edf.quantiles.red.2 = .joint.edf.of.quantiles.2(mat2.meth,mat2.unmeth,
                                                quants.red.2)

adjust.red.2 = .adjustment(oldQuantiles = quants.red.2,
                           newQuantiles = newQuantiles.red.2)

# Second Variation Normalization
fit1.red = normalize.bivariate(mat2.meth,mat2.unmeth,
                                    joint.ecdf = joint.prob.red.2,
                                    quantiles.edf = edf.quantiles.red.2,
                                    adjustments = adjust.red.2)

# Methylated and Unmethylated values
fitting.red.meth = fit1.red[,1:40]
fitting.red.unmeth = fit1.red[,41:80]

# Beta-values
fit.beta.red = fitting.red.meth / (fitting.red.meth + fitting.red.unmeth +100)
#plot
densityPlot(fit.beta.red, sampGroups = type.mania)