#----Supplementary Work for the Thesis of Clifford Yacas ----- #----in fullfillment for M.Sc in Statistics---- #----Copyrighted 2020--- #---- Chapter 3 Methodology ---- #---- creating a function similar to 'preprocessFunNorm' ---- # need minfi package for supplementary functions for 450K data # need pcaPP package for faster computation of Kendall's Tau # need VineCopula package to calculate parameter of copula from # Kendall's Tau # need pls package for efficient partial least squares computation library(minfi) library(pcaPP) library(copula) library(VineCopula) library(pls) # Main Function # We note that many of the functions are either directly # from the original FunNorm function. # We also note that the functions we've created are conceptually based # on original FunNorm but have been changed in a way to implement # our ideas in Chapter 3. preprocessFunnorm.bivar <- function(rgSet, ncomp=3, sex = NULL, bgCorr = TRUE, dyeCorr = TRUE, keepCN = TRUE, ratioConvert = TRUE, verbose = TRUE) { # makes sure that object is minfi-backed .isMatrixBackedOrStop(rgSet, "preprocessFunnorm.bivar") # makes sure that the object is'RGChannelSet' or 'RGChannelSetExtended' .isRGOrStop(rgSet) rgSet <- updateObject(rgSet) # Background correction and dye bias normalization via noob # (preprocessNoob() function): if (bgCorr){ if(verbose && dyeCorr) { message("[preprocessFunnorm.bivar] Background and dye bias correction with noob") } else { message("[preprocessFunnorm.bivar] Background correction with noob") } gmSet <- preprocessNoob(rgSet, dyeCorr = dyeCorr) if(verbose) message("[preprocessFunnorm.bivar] Mapping to genome") gmSet <- mapToGenome(gmSet) } else { if(verbose) message("[preprocessFunnorm.bivar] Mapping to genome") gmSet <- mapToGenome(rgSet) } # tells the user if the inner function should verbose subverbose <- max(as.integer(verbose) - 1L, 0) if(verbose) message("[preprocessFunnorm.bivar] Quantile extraction") # extraction of control probes from data as described by # Fortin et al. (2014) extractedData <- .extractFromRGSet450k(rgSet) rm(rgSet) # adds sex vector if not specified before if (is.null(sex)) { gmSet <- addSex(gmSet, getSex(gmSet, cutoff = -3)) sex <- rep(1L, length(gmSet$predictedSex)) sex[gmSet$predictedSex == "F"] <- 2L } if(verbose) message("[preprocessFunnorm.bivar] Normalization") # Gets A matrix of copy number values of the data # Copy number variation is a phenomenon in which # sections of the genome are repeated and the number # of repeats in the genome varies between individuals. # Needed to convert data to ratio data. In our case, # to Beta-values if(keepCN) { CN <- getCN(gmSet) } # The main function that does our quantile normalization process # specified in the thesis in Chapter 3 gmSet <- .normalizeFunnorm450k.bivar(object = gmSet, extractedData = extractedData, ncomp = ncomp, sex=sex, verbose = subverbose) # Stores preprocess method used. preprocessMethod <- c(preprocessMethod(gmSet), mu.norm = sprintf("Funnorm.bivar, ncomp=%s", ncomp)) # Gets Beta-values if specified if(ratioConvert) { grSet <- ratioConvert(gmSet, type = "Illumina", keepCN = keepCN) if(keepCN) { assay(grSet, "CN") <- CN } grSet@preprocessMethod <- preprocessMethod return(grSet) } else { gmSet@preprocessMethod <- preprocessMethod return(gmSet) } } ##### Functions and Helper Functions ##### # Gets the indices of the probes belonging in either # Type I Green, Type I Red, Type II and Sex chromosomes .getFunnormIndices <- function(object) { .isGenomicOrStop(object) probeType <- getProbeType(object, withColor = TRUE) autosomal <- (seqnames(object) %in% paste0("chr", 1:22)) indices <- list(IGrn = which(probeType == "IGrn" && autosomal), IRed = which(probeType == "IRed" && autosomal), II = which(probeType == "II" && autosomal), X = which(seqnames(object) == "chrX"), Y = which(seqnames(object) == "chrY")) indices } # The main function that does our normalization process # specified in Chapter 3 .normalizeFunnorm450k.bivar <- function(object, extractedData, ncomp, sex, verbose = FALSE) { normalizeQuantiles.bivar <- function(Meth, Unmeth, indices, sex = NULL) { # Get the methylated and unmethylated probe signals via # probe type Meth1 <- Meth[indices,,drop=FALSE] Unmeth1 <- Unmeth[indices,,drop=FALSE] # The old bivariate quantiles of the data as specified in the thesis. # We use the Frank copula and use v = u for the copula C(u,v). oldQuantiles <- .bivariate.quantiles.frank.final(Meth = Meth1, Unmeth = Unmeth1, alpha = probs) # regressing out unwanted variation by PLS regression # for either the autosomes or sex chromosomes if(is.null(sex)) { newQuantiles <- .returnNewQuantiles(controlMatrix = model.matrix, quantiles = oldQuantiles, ncomp = ncomp) } else { newQuantiles <- .returnFitBySex.bivar(controlMatrix = model.matrix, quantiles = oldQuantiles, ncomp = ncomp, sex = sex) } # This function was from the original FunNorm # where linear interpolation will be done between the quantiles # after regressing out the variation. .normalizeMatrix(cbind(Meth1,Unmeth1), newQuantiles) } indicesList <- .getFunnormIndices(object) # The control matrix is built from the extracted data # by using the summary statistics defined in Fortin et al. (2014) # for the control probes model.matrix <- .buildControlMatrix450k(extractedData) # Remove the first and last quantile level since we set the 1st quantile # to be 0 and the last quantile to be the 499th quantile value + 1000 probs <- seq(from = 0, to = 1, length.out = 500)[c(-1,-500)] Meth <- getMeth(object) Unmeth <- getUnmeth(object) Combined <- cbind(Meth,Unmeth) n = ncol(Meth) if (ncomp > 0){ # autosomes normalization for (type in c("IGrn", "IRed", "II")) { indices <- indicesList[[type]] if(length(indices) > 0) { if(verbose) message(sprintf("[normalizeFunnorm450k.bivar] Normalization of the %s probes", type)) Combined[indices,] <- normalizeQuantiles.bivar(Meth = Meth, Unmeth = Unmeth, indices = indices, sex = NULL) } } # X-chrom normalization indices <- indicesList[["X"]] if(length(indices) > 0) { if(verbose) message("[normalizeFunnorm450k.bivar] Normalization of the X-chromosome") Combined[indices,] <- normalizeQuantiles.bivar(Meth =Meth, Unmeth = Unmeth, indices = indices, sex = sex) #() } } #() Meth = Combined[,c(1:n)] Unmeth = Combined[,c((n+1):(2*n))] # Y-chrom normalization # For Y-chrom, we use quantile normalization defined by Bolstad et al. (2003) # This method is in the preprocessCore package but is imported with minfi indices <- indicesList[["Y"]] if(length(indices) > 0) { if(verbose) message("[normalizeFunnorm450k.bivar] Normalization of the Y-chromosome") sex <- as.character(sex) levels <- unique(sex) nSexes <- length(levels) #() if (nSexes == 2) { level1 <- levels[1] level2 <- levels[2] } if (nSexes == 2) { if (sum(sex == level1)>1) { Meth[indices, sex==level1] <- preprocessCore::normalize.quantiles(Meth[indices, sex == level1, drop=FALSE]) Unmeth[indices, sex==level1] <- preprocessCore::normalize.quantiles(Unmeth[indices, sex == level1,drop=FALSE]) } if (sum(sex == level2)>1) { Meth[indices, sex==level2] <- preprocessCore::normalize.quantiles(Meth[indices, sex == level2,drop=FALSE]) Unmeth[indices, sex==level2] <- preprocessCore::normalize.quantiles(Unmeth[indices, sex == level2,drop=FALSE]) } } else { Meth[indices,] <- preprocessCore::normalize.quantiles(Meth[indices,]) Unmeth[indices,] <- preprocessCore::normalize.quantiles(Unmeth[indices,]) } } assay(object, "Meth") <- Meth assay(object, "Unmeth") <- Unmeth return(object) } ### To extract quantiles and control probes from rgSet .extractFromRGSet450k <- function(rgSet) { rgSet <- updateObject(rgSet) controlType <- c("BISULFITE CONVERSION I", "BISULFITE CONVERSION II", "EXTENSION", "HYBRIDIZATION", "NEGATIVE", "NON-POLYMORPHIC", "NORM_A", "NORM_C", "NORM_G", "NORM_T", "SPECIFICITY I", "SPECIFICITY II", "TARGET REMOVAL", "STAINING") #() array <- annotation(rgSet)[["array"]] #() ## controlAddr <- getControlAddress(rgSet, controlType = controlType, asList = TRUE) ctrls <- getProbeInfo(rgSet, type = "Control") #() if(!all(controlType %in% ctrls$Type)) stop("The `rgSet` does not contain all necessary control probes") ctrlsList <- split(ctrls, ctrls$Type)[controlType] #() redControls <- getRed(rgSet)[ctrls$Address,,drop=FALSE] #() redControls <- lapply(ctrlsList, function(ctl) redControls[ctl$Address,,drop=FALSE]) #() greenControls <- getGreen(rgSet)[ctrls$Address,,drop=FALSE] #() greenControls <- lapply(ctrlsList, function(ctl) greenControls[ctl$Address,,drop=FALSE]) #() ## Extraction of the undefined negative control probes oobRaw <- getOOB(rgSet) probs <- c(0.01, 0.50, 0.99) greenOOB <- t(colQuantiles(oobRaw$Grn, na.rm = TRUE, probs = probs)) redOOB <- t(colQuantiles(oobRaw$Red, na.rm=TRUE, probs = probs)) oob <- list(greenOOB = greenOOB, redOOB = redOOB) return(list( greenControls = greenControls, redControls = redControls, oob = oob, ctrlsList = ctrlsList, array = array)) } ## Extraction of the Control matrix from the control probes in extracted data ## These are the summary statistics of the control probes ## as specified in Fortin et al. (2014) .buildControlMatrix450k <- function(extractedData) { getCtrlsAddr <- function(exType, index) { ctrls <- ctrlsList[[index]] addr <- ctrls$Address names(addr) <- ctrls$ExtendedType na.omit(addr[exType]) } array <- extractedData$array greenControls <- extractedData$greenControls redControls <- extractedData$redControls controlNames <- names(greenControls) ctrlsList <- extractedData$ctrlsList ## Bisulfite conversion extraction for probe type II: index <- match("BISULFITE CONVERSION II", controlNames) redControls.current <- redControls[[ index ]] bisulfite2 <- colMeans2(redControls.current, na.rm = TRUE) ## Bisulfite conversion extraction for probe type I: index <- match("BISULFITE CONVERSION I", controlNames) if (array=="IlluminaHumanMethylation450k"){ addr <- getCtrlsAddr(exType = sprintf("BS Conversion I%sC%s", c(" ", "-", "-"), 1:3), index = index) } else { addr <- getCtrlsAddr(exType = sprintf("BS Conversion I%sC%s", c("-", "-"), 1:2), index = index) } greenControls.current <- greenControls[[ index ]][addr,,drop=FALSE] if (array=="IlluminaHumanMethylation450k"){ addr <- getCtrlsAddr(exType = sprintf("BS Conversion I-C%s", 4:6), index = index) } else { addr <- getCtrlsAddr(exType = sprintf("BS Conversion I-C%s", 3:5), index = index) } redControls.current <- redControls[[ index ]][addr,, drop=FALSE] if (nrow(redControls.current)==nrow(greenControls.current)){ bisulfite1 <- colMeans2(redControls.current + greenControls.current, na.rm = TRUE) } else { bisulfite1 <- colMeans2(redControls.current, na.rm=TRUE) + colMeans2(greenControls.current, na.rm = TRUE) } ## Staining index <- match("STAINING", controlNames) addr <- getCtrlsAddr(exType = "Biotin (High)", index = index) stain.green <- t(greenControls[[ index ]][addr,,drop=FALSE]) addr <- getCtrlsAddr(exType = "DNP (High)", index = index) stain.red <- t(redControls[[ index ]][addr,, drop=FALSE ]) ## Extension index <- match("EXTENSION", controlNames) addr <- getCtrlsAddr(exType = sprintf("Extension (%s)", c("A", "T")), index = index) extension.red <- t(redControls[[index]][addr,,drop=FALSE]) colnames(extension.red) <- paste0("extRed", 1:ncol(extension.red)) addr <- getCtrlsAddr(exType = sprintf("Extension (%s)", c("C", "G")), index = index) extension.green <- t(greenControls[[index]][addr,,drop=FALSE]) colnames(extension.green) <- paste0("extGrn", 1:ncol(extension.green)) ## Hybridization should be monitored only in the green channel index <- match("HYBRIDIZATION", controlNames) hybe <- t(greenControls[[index]]) colnames(hybe) <- paste0("hybe", 1:ncol(hybe)) ## Target removal should be low compared to hybridization probes index <- match("TARGET REMOVAL", controlNames) targetrem <- t(greenControls[[index]]) colnames(targetrem) <- paste0("targetrem", 1:ncol(targetrem)) ## Non-polymorphic probes index <- match("NON-POLYMORPHIC", controlNames) addr <- getCtrlsAddr(exType = sprintf("NP (%s)", c("A", "T")), index = index) nonpoly.red <- t(redControls[[index]][addr, ,drop=FALSE]) colnames(nonpoly.red) <- paste0("nonpolyRed", 1:ncol(nonpoly.red)) addr <- getCtrlsAddr(exType = sprintf("NP (%s)", c("C", "G")), index = index) nonpoly.green <- t(greenControls[[index]][addr, ,drop=FALSE]) colnames(nonpoly.green) <- paste0("nonpolyGrn", 1:ncol(nonpoly.green)) ## Specificity II index <- match("SPECIFICITY II", controlNames) greenControls.current <- greenControls[[index]] redControls.current <- redControls[[index]] spec2.green <- t(greenControls.current) colnames(spec2.green) <- paste0("spec2Grn", 1:ncol(spec2.green)) spec2.red <- t(redControls.current) colnames(spec2.red) <- paste0("spec2Red", 1:ncol(spec2.red)) spec2.ratio <- colMeans2(greenControls.current, na.rm = TRUE) / colMeans2(redControls.current, na.rm = TRUE) ## Specificity I index <- match("SPECIFICITY I", controlNames) addr <- getCtrlsAddr(exType = sprintf("GT Mismatch %s (PM)", 1:3), index = index) greenControls.current <- greenControls[[index]][addr,,drop=FALSE] redControls.current <- redControls[[index]][addr,,drop=FALSE] spec1.green <- t(greenControls.current) colnames(spec1.green) <- paste0("spec1Grn", 1:ncol(spec1.green)) spec1.ratio1 <- colMeans2(redControls.current, na.rm = TRUE) / colMeans2(greenControls.current, na.rm = TRUE) index <- match("SPECIFICITY I", controlNames) # Added that line addr <- getCtrlsAddr(exType = sprintf("GT Mismatch %s (PM)", 4:6), index = index) greenControls.current <- greenControls[[index]][addr,,drop=FALSE] redControls.current <- redControls[[index]][addr,,drop=FALSE] spec1.red <- t(redControls.current) colnames(spec1.red) <- paste0("spec1Red", 1:ncol(spec1.red)) spec1.ratio2 <- colMeans2(greenControls.current, na.rm = TRUE) / colMeans2(redControls.current, na.rm = TRUE) spec1.ratio <- (spec1.ratio1 + spec1.ratio2) / 2 ## Normalization probes: index <- match(c("NORM_A"), controlNames) normA <- colMeans2(redControls[[index]], na.rm = TRUE) index <- match(c("NORM_T"), controlNames) normT <- colMeans2(redControls[[index]], na.rm = TRUE) index <- match(c("NORM_C"), controlNames) normC <- colMeans2(greenControls[[index]], na.rm = TRUE) index <- match(c("NORM_G"), controlNames) normG <- colMeans2(greenControls[[index]], na.rm = TRUE) dyebias <- (normC + normG)/(normA + normT) oobG <- extractedData$oob$greenOOB oobR <- extractedData$oob$redOOB oob.ratio <- oobG[2,]/oobR[2,] oobG <- t(oobG) colnames(oobG) <- paste0("oob", c(1,50,99)) model.matrix <- cbind( bisulfite1, bisulfite2, extension.green, extension.red, hybe, stain.green, stain.red, nonpoly.green, nonpoly.red, targetrem, spec1.green, spec1.red, spec2.green, spec2.red, spec1.ratio1, spec1.ratio, spec2.ratio, spec1.ratio2, normA, normC, normT, normG, dyebias, oobG, oob.ratio) ## Imputation for (colindex in 1:ncol(model.matrix)) { if(any(is.na(model.matrix[,colindex]))) { column <- model.matrix[,colindex] column[is.na(column)] <- mean(column, na.rm = TRUE) model.matrix[ , colindex] <- column } } ## Scaling model.matrix <- scale(model.matrix) ## Fixing outliers model.matrix[model.matrix > 3] <- 3 model.matrix[model.matrix < (-3)] <- -3 ## Rescaling model.matrix <- scale(model.matrix) return(model.matrix) } # The old quantiles in a bivariate manner using our definition # of a bivariate quantile. We use the Frank copula with C(u,v) # where v =u in our case. .bivariate.quantiles.frank.final = function(Meth, Unmeth, alpha){ n = ncol(Meth) Meth.sort = apply(Meth, 2, sort) Unmeth.sort = apply(Unmeth, 2, sort) bivariate.list = c() for (i in 1:n){ m = length(Meth[,i]) tau = cor.fk(Meth[,i],Unmeth[,i]) theta = BiCopTau2Par(family = 5, tau = tau) roots = (log(1 - sign(theta)* sqrt(-exp(-theta) - exp(-theta*alpha) + exp(-theta*(alpha+1)) + 1))) / -theta quants.meth = Meth.sort[,i][m*roots] quants.unmeth = Unmeth.sort[,i][m*roots] b.quants = cbind(quants.meth,quants.unmeth) colnames(b.quants) = c(paste(colnames(Meth)[i], "Meth" ,sep = '.'), paste(colnames(Unmeth)[i], "Unmeth", sep = '.')) bivariate.list = cbind(bivariate.list, b.quants) } row.names(bivariate.list) = as.character(alpha) return(bivariate.list) } # Return the normalized quantiles in a bivariate manner # for the autosomes .returnNewQuantiles = function(controlMatrix, quantiles, ncomp){ n = ncol(quantiles) quantiles = rbind(numeric(n),quantiles, quantiles[nrow(quantiles),] + 1000) rownames(quantiles)[1] = '0.0'; rownames(quantiles)[500] = '1.0' newQuantiles = array(0L, dim(quantiles)) #() for (i in 2:nrow(quantiles)){ #() quant = matrix(quantiles[i,], n/2,2, byrow = T) #() meanFunction = colMeans2(quant) res = sweep(quant,2,meanFunction) fit = plsr(res ~ controlMatrix,ncomp = ncomp) newfit = sweep(fit$residuals[, , paste(as.character(ncomp), 'comps')],2,-meanFunction) newQuantiles[i,] = as.vector(newfit) } newQuantiles = .regularizeQuantiles(newQuantiles) #Putting row and column names for convenience nam = c() nam = append(nam,colnames(quantiles)[seq(1,n-1,2)]) nam = append(nam,colnames(quantiles)[seq(2,n,2)]) colnames(newQuantiles) = nam row.names(newQuantiles) = row.names(quantiles) return(newQuantiles) } ### Return the normalized quantile functions in a bivariate manner for sex .sortbysex = function(sex, levels, newQuantiles1, newQuantiles2){ #note that newQuantiles1 is for female in this example #and newQuantiles2 is for male in this example n = ncol(newQuantiles1)/2 m = ncol(newQuantiles2)/2 l = length(sex) # m+n = l newQuantiles1.meth = newQuantiles1[,c(1:n)] newQuantiles1.unmeth = newQuantiles1[,c((n+1):(2*n))] newQuantiles2.meth = newQuantiles2[,c(1:m)] newQuantiles2.unmeth = newQuantiles2[,c((m+1):(2*m))] meth.sort = matrix(0,nrow=500,ncol=(n+m)) unmeth.sort = matrix(0,nrow=500,ncol=(n+m)) #() for (i in 1:l){ if (sex[i] == levels[1]){ meth.sort[,i] <- newQuantiles1.meth[,1] unmeth.sort[,i] <- newQuantiles1.unmeth[,1] newQuantiles1.meth = as.matrix(newQuantiles1.meth[,-1]) newQuantiles1.unmeth = as.matrix(newQuantiles1.unmeth[,-1]) #() } else{ meth.sort[,i] <- newQuantiles2.meth[,1] unmeth.sort[,i] <- newQuantiles2.unmeth[,1] newQuantiles2.meth = as.matrix(newQuantiles2.meth[,-1]) newQuantiles2.unmeth = as.matrix(newQuantiles2.unmeth[,-1]) #() } } fulllist = cbind(meth.sort,unmeth.sort) return(fulllist) } # Helper function for .returnFitBySex.bivar # Needed when normalization should be done separately by sex .sex.bivariate = function(sex){ sex.biv = c() for (i in 1:length(sex)){ sex.val = rep(sex[i],2) sex.biv = append(sex.biv,sex.val) } return(sex.biv) } .returnFitBySex.bivar <- function(controlMatrix, quantiles, ncomp, sex) { stopifnot(is.matrix(quantiles)) stopifnot(is.matrix(controlMatrix)) sex <- as.character(sex) levels <- unique(sex) nSexes <- length(levels) if (nSexes == 2) { sex1 <- sum(sex == levels[1]) sex2 <- sum(sex == levels[2]) } else { sex1 <- sum(sex == levels[1]) sex2 <- 0 } ## When normalization should not be performed by sex separately: if ((sex1 <= 10) | (sex2 <= 10)) { newQuantiles <- .returnNewQuantiles(controlMatrix = controlMatrix, quantiles = quantiles, ncomp = ncomp) } else { sex.bivar = .sex.bivariate(sex=sex) quantiles1 <- quantiles[, sex.bivar == levels[1]] controlMatrix1 <- controlMatrix[sex == levels[1], ] newQuantiles1 <- .returnNewQuantiles(controlMatrix = controlMatrix1, quantiles = quantiles1, ncomp = ncomp) quantiles2 <- quantiles[, sex.bivar == levels[2]] controlMatrix2 <- controlMatrix[sex == levels[2], ] newQuantiles2 <- .returnNewQuantiles(controlMatrix = controlMatrix2, quantiles = quantiles2, ncomp = ncomp) newQuantiles = .sortbysex(sex = sex, levels=levels, newQuantiles1=newQuantiles1, newQuantiles2=newQuantiles2) } return(newQuantiles) } # Normalize a matrix of intensities via linear interpolation # between the quantile values and # quantile normalization using this new distribution as target/reference # distribution. # This then makes the new methylated and unmethylated values come # from this distribution .normalizeMatrix <- function(intMatrix, newQuantiles) { n <- nrow(newQuantiles) normMatrix <- sapply(1:ncol(intMatrix), function(i) { crtColumn <- intMatrix[ , i] crtColumn.reduced <- crtColumn[!is.na(crtColumn)] ## Generation of the corrected intensities: target <- sapply(1:(n-1), function(j) { start <- newQuantiles[j,i] end <- newQuantiles[j+1,i] if (!isTRUE(all.equal(start,end))){ sequence <- seq(start, end,( end-start)/n)[-(n+1)] } else { sequence <- rep(start, n) } return(sequence) }) target <- as.vector(target) result <- preprocessCore::normalize.quantiles.use.target(matrix(crtColumn.reduced), target) return(result) }) return(normMatrix) } # To ensure a monotonically increasing and non-negative quantile function .regularizeQuantiles <- function(x){ x[x<0] <- 0 colCummaxs(x) } # Some other helper functions obtained fromt the utilities in minfi # so to make sure the data we work with is the correct object class .isMatrixBackedOrStop <- function(object, FUN) { if (!.isMatrixBacked(object)) { stop("'", FUN, "()' only supports matrix-backed minfi objects.", call. = FALSE) } } .isMatrixBacked <- function(object) { stopifnot(is(object, "SummarizedExperiment")) all(vapply(assays(object), is.matrix, logical(1L))) } .isRGOrStop <- function(object) { if (!is(object, "RGChannelSet")) { stop("object is of class '", class(object), "', but needs to be of ", "class 'RGChannelSet' or 'RGChannelSetExtended'") } } .isGenomicOrStop <- function(object) { if (!is(object, "GenomicMethylSet") && !is(object, "GenomicRatioSet")) { stop("object is of class '", class(object), "', but needs to be of ", "class 'GenomicMethylSet' or 'GenomicRatioSet'") } } #---- Chapter 4: Second Methodology ---- # Need minfi package for supplementary functions for 450K data. # Need pcaPP pacakge for faster computation of Kendall's Tau. # Need VineCopula package to calculate parameter of copula from # Kendall's Tau. # Need pls package for efficient partial least squares computation. # Need bivariate package for efficient computation of joint probabilities library(minfi) library(VineCopula) library(pcaPP) library(bivariate) library(pls) #---- Functions --- # Getting the probabilities that stem from Methylated values for faster # computation getmeth.probs = function(Meth){ n = ncol(Meth) Fhat = vector("list",n) Fhat1 = matrix(NA, nrow=nrow(Meth), ncol=n) for (i in 1:n) { Fhat[[i]] = ecdf(Meth[,i]) } for (i in 1:n) { Fhat1[,i] = Fhat[[i]](Meth[,i]) } return(Fhat1) } # This function retrieves the joint probabilities for each probe # for each sample. get.bivariate.probabilities = function(Meth, Unmeth){ n = ncol(Meth) Fhat = vector("list",n) Fhat1 = matrix(NA, nrow=nrow(Meth), ncol=n) for (i in 1:n) { Fhat[[i]] = ebvcdf(Meth[,i],Unmeth[,i]) } for (i in 1:n) { Fhat1[,i] = Fhat[[i]](Meth[,i], Unmeth[,i]) } return(Fhat1) } # We now retrieve the probability points discussed in Chapter 4.2 # by letting v =1-u for the copula C(u,v) bivariate.quantiles.frank.final.2 = function(Meth, Unmeth, alpha){ n = ncol(Meth) Meth.sort = apply(Meth, 2, sort) Unmeth.sort = apply(Unmeth, 2, sort) bivariate.list = c() for (i in 1:n){ m = length(Meth[,i]) tau = cor.fk(Meth[,i],Unmeth[,i]) theta = BiCopTau2Par(family = 5, tau = tau) a = exp(theta*(alpha - 1)) - exp(theta*alpha) + 1 b = -2*exp(-theta) c = exp(-theta) roots = (log((-b + sqrt(b^2 -4*a*c))/(2*a))) / (-theta) roots2 = 1 - roots roots[roots == 0] <- 1/m roots2[roots2 == 0] <-1/m quants.meth = Meth.sort[,i][m*roots] quants.unmeth = Unmeth.sort[,i][m*roots2] b.quants = cbind(quants.meth,quants.unmeth) colnames(b.quants) = c(paste(colnames(Meth)[i], "Meth" ,sep = '.'), paste(colnames(Unmeth)[i], "Unmeth", sep = '.')) bivariate.list = cbind(bivariate.list, b.quants) } return(bivariate.list) } # For the methodology described in 4.2, we retrieve the minimums and maximums # instead of setting the minimums to 0 and the maximums to the second highest # value + 1000 returnNewQuantiles2 = function(controlMatrix, quantiles, ncomp){ n = ncol(quantiles) newQuantiles = array(0L, dim(quantiles)) for (i in 1:nrow(quantiles)){ quant = matrix(quantiles[i,], n/2,2, byrow = T) meanFunction = colMeans2(quant) res = sweep(quant,2,meanFunction) fit = plsr(res ~ controlMatrix,ncomp = ncomp) newfit = sweep(fit$residuals[, , paste(as.character(ncomp), 'comps')],2,-meanFunction) newQuantiles[i,] = as.vector(newfit) } newQuantiles = .regularizeQuantiles(newQuantiles) #Putting row and column names for convenience, in case if you want to look nam = c() nam = append(nam,colnames(quantiles)[seq(1,n-1,2)]) nam = append(nam,colnames(quantiles)[seq(2,n,2)]) colnames(newQuantiles) = nam row.names(newQuantiles) = row.names(quantiles) return(newQuantiles) } # We do this regularization in this way since the values in these # bivariate probability points are bidirectional, ie when # one value increases, the other value decreases .regularizeQuantiles <- function(x){ n = ncol(x) /2 x[x<0] <- 0 x = cbind(colCummaxs(x[,1:n]), colCummins(x[,(n+1):(2*n)])) } # We now take get the adjustments that were done to the quantiles # by the PLS regression and store them adjustment = function(oldQuantiles, newQuantiles){ n = ncol(oldQuantiles) old.meth = oldQuantiles[,seq(1,(n - 1),by=2)] old.unmeth = oldQuantiles[,seq(2,n,by=2)] #browser() oldquants = cbind(old.meth,old.unmeth) adjust = newQuantiles - oldquants return(adjust) } # The adjustment process described in the first and # second Variation Methodologies normalize.bivariate = function(Meth, Unmeth, joint.ecdf, quantiles.edf, adjustments){ n = ncol(Meth) m = nrow(Meth) s = dim(quantiles.edf)[1] adjust.meth = adjustments[,1:n] adjust.unmeth = adjustments[,(n+1):(2*n)] Meth.new = matrix(NA, nrow=m, ncol = n) Unmeth.new = matrix(NA, nrow=m, ncol = n) for (i in 1:n){ for (j in 1:(s-1)){ f2 = quantiles.edf[(j+1),i] f1 = quantiles.edf[j,i] indices = which( f1<=joint.ecdf[,i] & joint.ecdf[,i] < f2) fhat = joint.ecdf[indices, i] y = cbind(Meth[indices,i],Unmeth[indices,i]) adjust.k2 = cbind(adjust.meth[(j+1),i], adjust.unmeth[(j+1),i]) adjust.k1 = cbind(adjust.meth[j,i], adjust.unmeth[j,i]) weight = (fhat - f1) / (f2 - f1) diff.k = adjust.k2 - adjust.k1 adjust.y = cbind(adjust.k1[1] + weight*diff.k[1], adjust.k1[2] + weight*diff.k[2]) yhat = y + adjust.y # Methylated and Unmethylated values must be >= 0 yhat[yhat<0] <-0 Meth.new[indices, i] = yhat[,1] Unmeth.new[indices, i] = yhat[,2] } } return(cbind(Meth.new,Unmeth.new)) } # ---- Applying the methodology to the Mania dataset ---- # Retrieving the dataset from where is was stored RGSet <- read.metharray.exp('D:/methyldatamedium/methyldatamedium') # Getting the phenotypes of the samples (whether a patient has Mania or not) type.mania = c() for (i in seq(1,80, by =2)){ l = as.character(type_mania[i,1]) type.mania = append(type.mania, l) } # We follow the steps done from FunNorm to the data prior # to normalization. We use the functions from the Chapter 3 # methodology to get what we need gmSet <- preprocessNoob(RGSet, dyeCorr = T) gmSet <- mapToGenome(gmSet) extractedData = .extractFromRGSet450k(RGSet) mod.mat = .buildControlMatrix450k(extractedData) indices = .getFunnormIndices(gmSet) Meth.cor = getMeth(gmSet) Unmeth.cor = getUnmeth(gmSet) probs <- seq(from = 0, to = 1, length.out = 501) #---- Type I Green probes ---- mat1.meth = Meth.cor[indices[[1]],,drop=FALSE] mat1.unmeth = Unmeth.cor[indices[[1]],,drop=FALSE] Meth.green.probs = getmeth.probs(mat1.meth) system.time({joint.edf.green = get.bivariate.probabilities(mat1.meth,mat1.unmeth)}) # user system elapsed # 1017.23 2.57 1044.25 # Using the law of total probability P(a) = P(a & b) + P(a & b^c) to get # P(MethU) joint.prob.green.2 = Meth.green.probs - joint.edf.green # Probability points stemming from v = 1-u and the corresponding adjustments quants.green.2 = bivariate.quantiles.frank.final.2(mat1.meth, mat1.unmeth, probs) newQuantiles.green.2 = returnNewQuantiles2(controlMatrix = mod.mat, quantiles = quants.green.2, ncomp = 3) edf.quantiles.green.2 = joint.edf.of.quantiles.2(mat1.meth,mat1.unmeth, quants.green.2) adjust.green.2 = adjustment(oldQuantiles = quants.green.2, newQuantiles = newQuantiles.green.2) #Second Variation Normalization fit1.green = normalize.bivariate(mat1.meth,mat1.unmeth, joint.ecdf = joint.prob.green.2, quantiles.edf = edf.quantiles.green.2, adjustments = adjust.green.2) # Getting the Methylated and Unmethylated signals fitting.green.meth = fitting.green[,1:40] fitting.green.unmeth = fitting.green[,41:80] # beta-values fit.beta.green = fitting.green.meth / (fitting.green.meth + fitting.green.unmeth +100) #plot densityPlot(fit.beta.green, sampGroups = type.mania) #---- Type I Red Probes ---- mat2.meth = Meth.cor[indices[[2]],,drop=FALSE] mat2.unmeth = Unmeth.cor[indices[[2]],,drop=FALSE] Meth.red.probs = .getmeth.probs(mat2.meth) system.time({joint.prob.red.1 = .get.bivariate.probabilities(mat2.meth,mat2.unmeth)}) #user system elapsed #3755.82 157.16 3977.30 # Using the law of total probability P(a) = P(a & b) + P(a & b^c) to get # P(MethU) joint.prob.red.2 = Meth.red.probs - joint.prob.red.1 # Probability points stemming from v = 1-u and the corresponding adjustments quants.red.2 = .bivariate.quantiles.frank.final.2(mat2.meth, mat2.unmeth, probs) newQuantiles.red.2 = .returnNewQuantiles2(controlMatrix = mod.mat, quantiles = quants.red.2, ncomp = 3) edf.quantiles.red.2 = .joint.edf.of.quantiles.2(mat2.meth,mat2.unmeth, quants.red.2) adjust.red.2 = .adjustment(oldQuantiles = quants.red.2, newQuantiles = newQuantiles.red.2) # Second Variation Normalization fit1.red = normalize.bivariate(mat2.meth,mat2.unmeth, joint.ecdf = joint.prob.red.2, quantiles.edf = edf.quantiles.red.2, adjustments = adjust.red.2) # Methylated and Unmethylated values fitting.red.meth = fit1.red[,1:40] fitting.red.unmeth = fit1.red[,41:80] # Beta-values fit.beta.red = fitting.red.meth / (fitting.red.meth + fitting.red.unmeth +100) #plot densityPlot(fit.beta.red, sampGroups = type.mania)