###################
# Novel Binomial Method
# Generates from correlated binomials according to the input parameters
# Uses an optimization method to get an appropriate pmf to generate
# multivariate binomial values directly
# Outputs a matrix of observations
GenerateAnySnpOptParamBinomial <- function(
  pAllele = c(0.1, 0.1, 0.1),
  matTargetCor = diag(3),
  nObs = 500
){
  # pAllele = The minor allele frequency of each SNP in order
  # matTargetCor = The target correlation matrix
  # nObs = the number of observations to generate

  # Generate integer values from the pmf produced by the optimization method
  pmf <- calcOptPmfAnySnpBinomial(pAllele, matTargetCor)
  if (isTRUE(all.equal(sum(pmf), 0))) {
    return(matrix(NA, ncol = length(pAllele), nrow = nObs))
  }
  genIntVals <- genBag(nObs, pmf)

  # Constructs the map between the integer values and the binomial values
  # for each SNP.
  # Each column is a SNP, each row is an integer value, each entry is the
  # appropriate binomial value for the integer value.
  # Note that the first column of the map matrix corresponds to the last snp,
  # the second column of the map matrix is the second last snp,
  # ...
  # and the last column of the map matrix is the first snp.
  # Example of the SNP naming convention for 3 SNPs together:
  # 111, 112, 113, 121, 122, 123, 131, 132, 133,
  # 211, 212, 213, 221, 222, 223, 231, 232, 233,
  # 311, 312, 313, 321, 322, 323, 331, 332, 333
  # Corresponding integer values are
  #  1,  2,  3,  4,  5,  6,  7,  8,  9,
  # 10, 11, 12, 13, 14, 15, 16, 17, 18,
  # 19, 20, 21, 22, 23, 24, 25, 26, 27
  numSnp <- length(pAllele)
  mapIntToBin <- sapply(c(1:numSnp), function(snpInd){
    inner <- 3 ^ (snpInd - 1)
    outer <- 3 ^ (numSnp - snpInd)
    toAdd <- rep(
      c(rep(0, inner), rep(1, inner), rep(2, inner)),
      outer
    )
    return(toAdd)
  })
  # Reverse the ordering so SNP1 corresponds to column 1
  mapIntToBinOrdered <- mapIntToBin[, ncol(mapIntToBin):1]

  # use a simple apply to get the binomial values using the mapping
  multiBinomialVals <- apply(mapIntToBinOrdered, 2, function(colMap){
    colMap[genIntVals]
  })
  return(multiBinomialVals)
}

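# Illustrative usage sketch (hypothetical values, wrapped in `if (FALSE)` so
# sourcing this file does not execute it). It assumes the helper functions
# defined below and the lpSolve package are available. When the LP finds a
# valid pmf, the empirical column means and correlations should roughly match
# 2 * pAllele and matTargetCor.
if (FALSE) {
  pAllele      <- c(0.1, 0.2, 0.3)
  matTargetCor <- matrix(c(1,   0.2, 0.1,
                           0.2, 1,   0.3,
                           0.1, 0.3, 1), nrow = 3, byrow = TRUE)
  simBin <- GenerateAnySnpOptParamBinomial(pAllele, matTargetCor, nObs = 10000)
  colMeans(simBin)   # should be close to 2 * pAllele
  cor(simBin)        # should be close to matTargetCor
}
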
# Sets up the equations associated with the marginal binomial distributions
# Outputs a matrix
calcMatMarginalBinomial <- function(numSnp = 3, pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the constraint matrix associated to the marginal equations
  # numSnp = the number of snps
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the last column
  # is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the probabilities of each element in the support of the
  # target random variable.

  # First, construct the left hand side (LHS) of the equations
  snpMatLHS <- matrix(double(), 0, (3 ^ numSnp))
  snpVecRHS <- vector("double", 0)
  for (indSnp in c(1:numSnp)) {
    # For each snp, we separately work on the values it takes on
    # Note that we let the SNP value take on (1,2,3) instead of (0,1,2)
    # so we don't need to deal with some index issues
    # Construct the three rows associated to a particular SNP
    snpRows <- t(sapply(c(1:3), function(valSnp) {
      # Example of the naming convention for 3 SNPs together:
      # 111, 112, 113, 121, 122, 123, 131, 132, 133,
      # 211, 212, 213, 221, 222, 223, 231, 232, 233,
      # 311, 312, 313, 321, 322, 323, 331, 332, 333
      # If we're interested in finding the cells that make up a marginal,
      # e.g. if we want the marginal for SNP1 = 1,
      # then we want all names above that have a 1 in the first place (1??);
      # e.g. if we want the marginal for SNP2 = 3,
      # then we want all names above that have a 3 in the second place (?3?)
      # We can automate the way we find the marginals.
      # Note that the SNPs are ordered lowest to highest
      # (i.e. 111, 112, ..., 332, 333)
      # There will be a specific number of blocks;
      # this number is related to the index of the SNP we're looking at
      numBlocks <- 3 ^ (indSnp - 1)
      # we will have a slew/block of 1's
      blockOneWidth <- 3 ^ (numSnp - indSnp)
      # The space before the first block of 1's is
      # dependent on the value/snp of interest
      blockStartZeros <- (valSnp - 1) * blockOneWidth
      # The block of 1's will always be followed by a block of 0's
      blockEndZeros <- (3 - valSnp) * blockOneWidth
      ValBlock <- c(rep(0, blockStartZeros),
                    rep(1, blockOneWidth),
                    rep(0, blockEndZeros))
      ValRow <- rep(ValBlock, numBlocks)
      return(ValRow)
    }))
    # Append the 3 rows for a particular SNP to the matrix of constraints
    snpMatLHS <- rbind(snpMatLHS, snpRows)
    # Append the RHS of what the above 3 rows should equal
    pSnp <- pAlleles[indSnp]
    snpVecRHS <- c(snpVecRHS,               # Previous values
                   (1 - pSnp) ^ 2,          # value 1 (0 copies of the minor allele)
                   2 * (1 - pSnp) * pSnp,   # value 2 (1 copy)
                   pSnp ^ 2)                # value 3 (2 copies)
  }
  # Combine into a matrix and return
  snpMatConstraints <- as.matrix(cbind(snpMatLHS, snpVecRHS))
  return(snpMatConstraints)
}

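# Quick illustrative check (hypothetical values, not run when sourcing):
# for each SNP the three marginal rows partition the support, so each block
# of three RHS entries should sum to 1, i.e. (1 - p)^2 + 2p(1 - p) + p^2 = 1.
if (FALSE) {
  margMat <- calcMatMarginalBinomial(numSnp = 2, pAlleles = c(0.1, 0.25))
  dim(margMat)                          # 6 rows (3 per SNP), 3^2 + 1 = 10 columns
  rhs <- margMat[, ncol(margMat)]
  tapply(rhs, rep(1:2, each = 3), sum)  # each SNP's marginal RHS sums to 1
}
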
# Sets up the equations associated to the target correlation constraints
# Outputs a matrix
calcMatTargetCorBinomial <- function(numSnp = 3,
                                     matTargetCor = diag(3),
                                     pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the constraint matrix associated to the
  # target correlation equations
  # numSnp = the number of snps
  # matTargetCor = the target correlation matrix
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the last
  # column is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the frequencies of the multivariate SNPs.

  # The snp values that are used in the target correlation calculation
  valSnpXYmat <- matrix(c(2, 2, 3, 3,
                          2, 3, 2, 3), 2, 4, byrow = TRUE)
  # The coefficients to be placed on the variables
  valXYTargetCor <- c(1, 2, 2, 4)

  # We will be appending to construct the matrix
  snpLHSMatTarCor <- matrix(double(), 0, (3 ^ numSnp))
  snpRHSVecTarCor <- vector("double", 0)
  for (indSnpX in c(1:numSnp)) {
    for (indSnpY in c((indSnpX):numSnp)) {
      if (isTRUE(all.equal(indSnpX, indSnpY))) next  # Skip a SNP paired with itself
      # Looking at all pairs of SNPs;
      # SnpX is on a lower index (more left) than SnpY.
      # For each pair of SNPs we assemble the row for each value pair used in
      # the target correlation separately.
      # Construct a matrix where each row indicates where to place
      # the appropriate coefficient for a single pair of values
      # involved in the target correlation,
      # i.e. the snp values of 2-2, 2-3, 3-2 and 3-3 are
      # all used in the target correlation.
      # The contribution of each pair is calculated separately and
      # then added to get a single row.
      separateConstraint <- t(sapply(c(1:4), function(indXY) {
        valSnpX <- valSnpXYmat[1, indXY]
        valSnpY <- valSnpXYmat[2, indXY]
        valPlace <- valXYTargetCor[indXY]

        bigBlockNum <- 3 ^ (indSnpX - 1)
        bigBlockWidth <- 3 ^ (numSnp - indSnpX)
        bigBlockStartZeros <- (valSnpX - 1) * bigBlockWidth

        # Constructing the smaller block inside the big block
        smlBlockNum <- 3 ^ (indSnpY - indSnpX - 1)
        smlBlockoneWidth <- 3 ^ (numSnp - indSnpY)
        smlBlockStartZeros <- (valSnpY - 1) * smlBlockoneWidth
        smlBlockOnes <- smlBlockoneWidth
        smlBlockEndZeros <- (3 - valSnpY) * smlBlockoneWidth
        smlblock <- c(
          rep(0, smlBlockStartZeros),
          rep(valPlace, smlBlockOnes),
          rep(0, smlBlockEndZeros)
        )
        smlBlockRow <- rep(smlblock, smlBlockNum)

        bigBlockOnes <- smlBlockRow
        bigBlockEndZeros <- (3 - valSnpX) * bigBlockWidth
        # Constructing the final big block
        bigBlock <- c(rep(0, bigBlockStartZeros),
                      rep(smlBlockRow, 1),
                      rep(0, bigBlockEndZeros))
        bigBlockRow <- rep(bigBlock, bigBlockNum)
        return(bigBlockRow)
      }))
      # Collapse the above matrix into a row vector to be
      # used in linear optimization
      singleRowConstraint <- apply(separateConstraint, 2, sum)
      snpLHSMatTarCor <- rbind(snpLHSMatTarCor, singleRowConstraint)

      # RHS of target correlation is targetCor * sqrt(VarX * VarY) + E[X] * E[Y]
      pX <- pAlleles[indSnpX]
      pY <- pAlleles[indSnpY]
      nXY <- 2
      rhseXYTarCor <- matTargetCor[indSnpX, indSnpY] *
        sqrt(nXY * pX * (1 - pX) * nXY * pY * (1 - pY)) +
        (nXY * pX * nXY * pY)
      snpRHSVecTarCor <- c(snpRHSVecTarCor, rhseXYTarCor)
    }
  }
  snpMatTarCor <- as.matrix(cbind(snpLHSMatTarCor, snpRHSVecTarCor))
  return(snpMatTarCor)
}

# Combines the marginal and target correlation constraints into a single matrix
# Outputs a matrix
calcMatPmfConstaintBinomial <- function(numSnp = 3,
                                        matTargetCor = diag(3),
                                        pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the full constraint matrix (marginals plus target correlations)
  # numSnp = the number of snps
  # matTargetCor = the target correlation matrix
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the
  # last column is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the frequencies of the multivariate SNPs.
  matMarg <- calcMatMarginalBinomial(numSnp, pAlleles)
  matTarC <- calcMatTargetCorBinomial(numSnp, matTargetCor, pAlleles)
  snpMat <- as.matrix(rbind(matMarg, matTarC))
  return(snpMat)
}

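# Illustrative check of the combined constraint system (hypothetical values,
# not run when sourcing): there are 3 marginal rows per SNP plus one
# correlation row per SNP pair, and one column per cell of the joint pmf
# plus the RHS column.
if (FALSE) {
  numSnp  <- 3
  consMat <- calcMatPmfConstaintBinomial(numSnp, diag(numSnp), c(0.1, 0.2, 0.3))
  nrow(consMat) == 3 * numSnp + choose(numSnp, 2)  # expected TRUE (12 rows)
  ncol(consMat) == 3 ^ numSnp + 1                  # expected TRUE (28 columns)
}
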
# Calculates the pmf based on optimizing the set of constraints
# as given by the marginals and target correlation
# Outputs a vector that contains the probability of each element
calcOptPmfAnySnpBinomial <- function(pAllele, matTarCor){
  require(lpSolve)
  # pAllele = vector of minor allele frequencies
  # matTarCor = correlation matrix
  numSnp <- length(pAllele)
  oriMatSnp <- calcMatPmfConstaintBinomial(length(pAllele), matTarCor, pAllele)

  # Need to separate the matrix into LHS and RHS and add the unity constraint
  LHSOriMatSnp <- oriMatSnp[, -ncol(oriMatSnp)]
  RHSOriMatSnp <- as.vector(oriMatSnp[, ncol(oriMatSnp)])

  # Add the unity constraint to the above.
  # RHS only needs a 1 added
  uniRHSOriMatSnp <- c(RHSOriMatSnp, 1)
  # LHS needs an extra row and column, last diagonal entry needs to be 1
  uniLHSOriMatSnp1 <- rbind(LHSOriMatSnp, 0)
  uniLHSOriMatSnp2 <- cbind(uniLHSOriMatSnp1, 0)
  uniLHSOriMatSnp2[nrow(uniLHSOriMatSnp2), ncol(uniLHSOriMatSnp2)] <- 1
  uniLHSOriMatSnp <- uniLHSOriMatSnp2

  # Objective function:
  # all of the variables added should equal the unity (last) variable
  obj <- c(rep(1, ncol(LHSOriMatSnp)), -1)
  constraints_direction <- rep("=", length(uniRHSOriMatSnp))

  optimum <- lp(direction = "max",
                objective.in = obj,
                const.mat = uniLHSOriMatSnp,
                const.dir = constraints_direction,
                const.rhs = uniRHSOriMatSnp,
                all.int = FALSE)

  sol <- optimum$solution
  # Remove the last entry since that's the total sum.
  # The remaining vector contains the entries in the PMF of the target
  # random variable
  pmfSolve <- sol[-length(sol)]
  # Name the vector according to the naming convention
  names(pmfSolve) <- namingForBinomialPmf(numSnp)
  return(pmfSolve)
}

# Gets the name for each entry in the PMF of correlated binomials
# Outputs a vector of names (string)
namingForBinomialPmf <- function(numSnp){
  # numSnp = number of SNPs
  # The naming convention follows a specific pattern:
  # names are the ordered (lowest to highest) record of what
  # value (1,2,3) each snp (indicated by digit position) takes on.
  # For example, if snp1 is 1, snp2 is 3 and snp3 is 2,
  # then the corresponding name is 132.
  # Construct the names from right to left;
  # the pattern of 1,2,3 is a repetitive one.
  nmedoub <- rep(0, 3 ^ numSnp)
  for (snpInd in c(1:numSnp)) {
    inner <- 3 ^ (snpInd - 1)
    outer <- 3 ^ (numSnp - snpInd)
    eexp <- snpInd - 1
    toAdd <- (10 ^ eexp) * rep(
      c(rep(1, inner), rep(2, inner), rep(3, inner)),
      outer
    )
    nmedoub <- nmedoub + toAdd
  }
  nmestri <- paste("x", nmedoub, sep = "")
  return(nmestri)
}

# Generates integers given the probability for each integer
# nObs = number of observations to generate
# pGeno = vector of probability weights
# Outputs a vector of integers
genBag <- function(nObs, pGeno){
  if (!isTRUE(all.equal(sum(pGeno), 1)))
    print("Given probabilities don't add to 1")
  # Generate and return values
  genBagVals <- sample.int(length(pGeno), nObs, replace = TRUE, prob = pGeno)
  return(genBagVals)
}

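# Illustrative sketch (hypothetical values, not run when sourcing): compute
# the optimized pmf directly and, if the LP is feasible, check that it is a
# proper pmf whose SNP1 marginal recovers the Binomial(2, p) genotype
# probabilities used in the constraints. Requires the lpSolve package.
if (FALSE) {
  pAllele <- c(0.1, 0.2)
  pmf <- calcOptPmfAnySnpBinomial(pAllele, matrix(c(1, 0.2, 0.2, 1), 2, 2))
  sum(pmf)  # should be approximately 1 when a valid pmf was found
  # Marginal of SNP1: names are x11, x12, ..., x33 with the first digit = SNP1
  tapply(pmf, substr(names(pmf), 2, 2), sum)
  # should be close to c((1 - 0.1)^2, 2 * 0.1 * 0.9, 0.1^2)
}
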
######################
# Novel Bernoulli Method
# Generates correlated Bernoullis according to the input parameters
# Uses the optimization method to get an appropriate pmf
# Independently generates twice the number of multivariate Bernoulli values
# such that they can be added component wise to get
# binomial marginal distributions
# Outputs a matrix of observations
GenerateAnySnpOptParamBernoulli <- function(
  pAllele = c(0.1, 0.1, 0.1),
  matTargetCor = diag(3),
  nObs = 500
){
  # pAllele = The minor allele frequency of each SNP in order
  # matTargetCor = The target correlation matrix
  # nObs = the number of observations to generate

  # Generate integer values from the pmf produced by the optimization method.
  # We'll generate all the needed Bernoulli values at once,
  # separate them at the end and add them to get binomial values
  pmf <- calcOptPmfAnySnpBernoulli(pAllele, matTargetCor)
  if (isTRUE(all.equal(sum(pmf), 0))) {
    return(matrix(NA, ncol = length(pAllele), nrow = nObs))
  }
  genIntVals <- genBag(2 * nObs, pmf)

  # Constructs the map between the integer values and the Bernoulli values
  # for each SNP.
  # Each column is a SNP, each row is an integer value, each entry is the
  # appropriate Bernoulli value for the integer value.
  # Note that the first column of the map matrix corresponds to the last snp,
  # the second column of the map matrix is the second last snp,
  # ...
  # and the last column of the map matrix is the first snp.
  # Example of the SNP naming convention for 3 SNPs together:
  # 111, 112, 121, 122, 211, 212, 221, 222
  # Corresponding integer values are
  #   1,   2,   3,   4,   5,   6,   7,   8
  numSnp <- length(pAllele)
  mapIntToBern <- sapply(c(1:numSnp), function(snpInd){
    inner <- 2 ^ (snpInd - 1)
    outer <- 2 ^ (numSnp - snpInd)
    toAdd <- rep(
      c(rep(0, inner), rep(1, inner)),
      outer
    )
    return(toAdd)
  })
  # Reverse the ordering so SNP1 corresponds to column 1
  mapIntToBernOrdered <- mapIntToBern[, ncol(mapIntToBern):1]

  # use a simple apply to get the Bernoulli values using the mapping
  multiBernVals <- apply(mapIntToBernOrdered, 2, function(colMultiMap){
    colMultiMap[genIntVals]
  })

  # Add the two halves of the Bernoulli values together to get binomial values
  multiBinVals <- apply(multiBernVals, 2, function(colBernVal){
    (colBernVal[1:nObs] + colBernVal[(nObs + 1):length(colBernVal)])
  })
  return(multiBinVals)
}

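# Illustrative usage sketch (hypothetical values, not run when sourcing).
# Because two independent multivariate Bernoulli draws are summed
# component-wise, each column should be Binomial(2, pAllele[i]) and the
# between-column correlation should roughly match matTargetCor when the LP
# returns a valid pmf.
if (FALSE) {
  pAllele      <- c(0.2, 0.3)
  matTargetCor <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
  simBern <- GenerateAnySnpOptParamBernoulli(pAllele, matTargetCor, nObs = 10000)
  colMeans(simBern)  # roughly 2 * pAllele
  cor(simBern)       # roughly matTargetCor
}
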
# Based on adding two Bernoullis
# Sets up the equations associated with the marginal constraints
# Outputs a matrix
calcMatMarginalBernoulli <- function(numSnp = 3, pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the constraint matrix associated to the marginal equations
  # numSnp = the number of snps
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the
  # last column is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the frequencies of the multivariate SNPs.
  # See below for the naming convention.

  # First, construct the left hand side (LHS) of the equations
  snpMatLHS <- matrix(double(), 0, (2 ^ numSnp))
  snpVecRHS <- vector("double", 0)
  for (indSnp in c(1:numSnp)) {
    # For each snp, we separately work on the values it takes on
    # Note that we let the SNP value take on (1,2) instead of (0,1)
    # so we don't need to deal with some index issues
    # Construct the two rows associated to a particular SNP
    snpRows <- t(sapply(c(1:2), function(valSnp) {
      # Example of the naming convention for 3 SNPs together:
      # 111, 112, 121, 122, 211, 212, 221, 222
      # If we're interested in finding the cells that make up a marginal,
      # e.g. if we want the marginal for SNP1 = 0,
      # then we want all names above that have a 1 in the first place (1??);
      # e.g. if we want the marginal for SNP2 = 1,
      # then we want all names above that have a 2 in the second place (?2?)
      # We can automate the way we find the marginals.
      # Note that the SNPs are ordered lowest to highest
      # (i.e. 111, 112, ..., 221, 222)
      # There will be a specific number of blocks;
      # this number is related to the index of the SNP we're looking at
      numBlocks <- 2 ^ (indSnp - 1)
      # we will have a slew/block of 1's
      blockOneWidth <- 2 ^ (numSnp - indSnp)
      # The space before the first block of 1's is dependent
      # on the value/snp of interest
      blockStartZeros <- (valSnp - 1) * blockOneWidth
      # The block of 1's will always be followed by a block of 0's
      blockEndZeros <- (2 - valSnp) * blockOneWidth
      ValBlock <- c(rep(0, blockStartZeros),
                    rep(1, blockOneWidth),
                    rep(0, blockEndZeros))
      ValRow <- rep(ValBlock, numBlocks)
      return(ValRow)
    }))
    # Append the 2 rows for a particular SNP to the matrix of constraints
    snpMatLHS <- rbind(snpMatLHS, snpRows)
    # Append the RHS of what the above 2 rows should equal
    pSnp <- pAlleles[indSnp]
    snpVecRHS <- c(snpVecRHS,    # Previous values
                   (1 - pSnp),   # value 1 (Bernoulli 0)
                   pSnp)         # value 2 (Bernoulli 1)
  }
  # Combine into a matrix and return
  snpMatConstraints <- as.matrix(cbind(snpMatLHS, snpVecRHS))
  return(snpMatConstraints)
}

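# Quick illustrative check (hypothetical values, not run when sourcing):
# each SNP contributes two marginal rows whose RHS entries, (1 - p) and p,
# sum to 1, and each LHS marginal row covers half of the 2^numSnp cells.
if (FALSE) {
  margBern <- calcMatMarginalBernoulli(numSnp = 3, pAlleles = c(0.1, 0.2, 0.3))
  dim(margBern)                           # 6 rows, 2^3 + 1 = 9 columns
  rowSums(margBern[, -ncol(margBern)])    # each marginal row selects 4 cells
  rhs <- margBern[, ncol(margBern)]
  tapply(rhs, rep(1:3, each = 2), sum)    # each SNP's pair of RHS entries sums to 1
}
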
# Sets up the equations associated to the target correlation constraints
# Outputs a matrix
calcMatTargetCorBernoulli <- function(numSnp = 3,
                                      matTargetCor = diag(3),
                                      pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the constraint matrix associated to the
  # target correlation equations
  # numSnp = the number of snps
  # matTargetCor = the target correlation matrix
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the last
  # column is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the frequencies of the multivariate SNPs.
  # See below for the naming convention.

  # The snp values that are used in the target correlation calculation
  valSnpXYmat <- matrix(c(2, 2), 2, 1, byrow = TRUE)
  # The coefficients to be placed on the variables
  valXYTargetCor <- c(1)

  # We will be appending to construct the matrix
  snpLHSMatTarCor <- matrix(double(), 0, (2 ^ numSnp))
  snpRHSVecTarCor <- vector("double", 0)
  for (indSnpX in c(1:numSnp)) {
    for (indSnpY in c((indSnpX):numSnp)) {
      if (isTRUE(all.equal(indSnpX, indSnpY))) next  # Skip a SNP paired with itself
      # Looking at all pairs of SNPs;
      # SnpX is on a lower index (more left) than SnpY.
      # Construct a matrix where each row indicates where to place
      # the appropriate coefficient for a single pair of values involved
      # in the target correlation,
      # i.e. only the snp value pair 2-2 (both Bernoullis equal to 1)
      # contributes to the target correlation.
      # The contribution of each pair is calculated separately
      # and then added to get a single row.
      separateConstraint <- t(sapply(c(1), function(indXY) {
        valSnpX <- valSnpXYmat[1, indXY]
        valSnpY <- valSnpXYmat[2, indXY]
        valPlace <- valXYTargetCor[indXY]

        bigBlockNum <- 2 ^ (indSnpX - 1)
        bigBlockWidth <- 2 ^ (numSnp - indSnpX)
        bigBlockStartZeros <- (valSnpX - 1) * bigBlockWidth

        # Constructing the smaller block inside the big block
        smlBlockNum <- 2 ^ (indSnpY - indSnpX - 1)
        smlBlockoneWidth <- 2 ^ (numSnp - indSnpY)
        smlBlockStartZeros <- (valSnpY - 1) * smlBlockoneWidth
        smlBlockOnes <- smlBlockoneWidth
        smlBlockEndZeros <- (2 - valSnpY) * smlBlockoneWidth
        smlblock <- c(
          rep(0, smlBlockStartZeros),
          rep(valPlace, smlBlockOnes),
          rep(0, smlBlockEndZeros)
        )
        smlBlockRow <- rep(smlblock, smlBlockNum)

        bigBlockOnes <- smlBlockRow
        bigBlockEndZeros <- (2 - valSnpX) * bigBlockWidth
        # Constructing the final big block
        bigBlock <- c(rep(0, bigBlockStartZeros),
                      rep(smlBlockRow, 1),
                      rep(0, bigBlockEndZeros))
        bigBlockRow <- rep(bigBlock, bigBlockNum)
        return(bigBlockRow)
      }))
      # Collapse the above matrix into a row vector to be
      # used in linear optimization
      singleRowConstraint <- apply(separateConstraint, 2, sum)
      snpLHSMatTarCor <- rbind(snpLHSMatTarCor, singleRowConstraint)

      # RHS of target correlation is targetCor * sqrt(VarX * VarY) + E[X] * E[Y]
      pX <- pAlleles[indSnpX]
      pY <- pAlleles[indSnpY]
      nXY <- 1
      rhseXYTarCor <- matTargetCor[indSnpX, indSnpY] *
        sqrt(nXY * pX * (1 - pX) * nXY * pY * (1 - pY)) +
        (nXY * pX * nXY * pY)
      snpRHSVecTarCor <- c(snpRHSVecTarCor, rhseXYTarCor)
    }
  }
  snpMatTarCor <- as.matrix(cbind(snpLHSMatTarCor, snpRHSVecTarCor))
  return(snpMatTarCor)
}

# Combines the marginal and target correlation constraints into a single matrix
# Outputs a matrix
calcMatPmfConstaintBernoulli <- function(numSnp = 3,
                                         matTargetCor = diag(3),
                                         pAlleles = c(0.1, 0.1, 0.1)) {
  # Calculates the full constraint matrix (marginals plus target correlations)
  # numSnp = the number of snps
  # matTargetCor = the target correlation matrix
  # pAlleles = the minor allele frequency for each snp
  # Returns a typical constraint matrix where the last column
  # is the RHS of all the equations and the LHS
  # denotes the coefficient on each variable
  # The variables are the frequencies of the multivariate SNPs.
  matMarg <- calcMatMarginalBernoulli(numSnp, pAlleles)
  matTarC <- calcMatTargetCorBernoulli(numSnp, matTargetCor, pAlleles)
  snpMat <- as.matrix(rbind(matMarg, matTarC))
  return(snpMat)
}

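# Illustrative sketch (hypothetical values, not run when sourcing): for two
# SNPs the correlation constraint reduces to fixing P(X = 1, Y = 1), since for
# Bernoulli variables E[XY] = P(X = 1, Y = 1). The RHS should therefore equal
# rho * sqrt(pX * (1 - pX) * pY * (1 - pY)) + pX * pY.
if (FALSE) {
  pX <- 0.2; pY <- 0.4; rho <- 0.3
  tarCor <- matrix(c(1, rho, rho, 1), 2, 2)
  corRow <- calcMatTargetCorBernoulli(2, tarCor, c(pX, pY))
  corRow[1, ]  # LHS selects only the "both equal 1" cell; last entry is the RHS
  rho * sqrt(pX * (1 - pX) * pY * (1 - pY)) + pX * pY  # matches that RHS
}
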
# Calculates the pmf based on optimizing the set of constraints as
# given by the marginals and target correlation
# Outputs a vector containing the probability of each element
calcOptPmfAnySnpBernoulli <- function(pAllele, matTarCor){
  require(lpSolve)
  # pAllele = vector of minor allele frequencies
  # matTarCor = correlation matrix
  numSnp <- length(pAllele)
  oriMatSnp <- calcMatPmfConstaintBernoulli(length(pAllele), matTarCor, pAllele)

  # Need to separate the matrix into LHS and RHS and add the unity constraint
  LHSOriMatSnp <- oriMatSnp[, -ncol(oriMatSnp)]
  RHSOriMatSnp <- as.vector(oriMatSnp[, ncol(oriMatSnp)])

  # Add the unity constraint to the above.
  # RHS only needs a 1 added
  uniRHSOriMatSnp <- c(RHSOriMatSnp, 1)
  # LHS needs an extra row and column, last diagonal entry needs to be 1
  uniLHSOriMatSnp1 <- rbind(LHSOriMatSnp, 0)
  uniLHSOriMatSnp2 <- cbind(uniLHSOriMatSnp1, 0)
  uniLHSOriMatSnp2[nrow(uniLHSOriMatSnp2), ncol(uniLHSOriMatSnp2)] <- 1
  uniLHSOriMatSnp <- uniLHSOriMatSnp2

  # Objective function:
  # all of the variables added should equal the unity (last) variable
  obj <- c(rep(1, ncol(LHSOriMatSnp)), -1)
  constraints_direction <- rep("=", length(uniRHSOriMatSnp))

  optimum <- lp(direction = "max",
                objective.in = obj,
                const.mat = uniLHSOriMatSnp,
                const.dir = constraints_direction,
                const.rhs = uniRHSOriMatSnp,
                all.int = FALSE)

  sol <- optimum$solution
  # Remove the last entry since that's the total sum
  pmfSolve <- sol[-length(sol)]
  # Name the vector according to the naming convention
  names(pmfSolve) <- namingForBernoullipmf(numSnp)
  return(pmfSolve)
}

# Gets the name for each entry in the PMF of correlated Bernoullis
# Outputs a vector of names (string)
namingForBernoullipmf <- function(numSnp){
  # numSnp = number of SNPs
  # The naming convention follows a specific pattern:
  # names are the ordered (lowest to highest) record of what value (1,2)
  # each snp (indicated by digit position) takes on.
  # For example, if snp1 is 1, snp2 is 1 and snp3 is 2,
  # then the corresponding name is 112.
  # Construct the names from right to left;
  # the pattern of 1,2 is a repetitive one.
  nmedoub <- rep(0, 2 ^ numSnp)
  for (snpInd in c(1:numSnp)) {
    inner <- 2 ^ (snpInd - 1)
    outer <- 2 ^ (numSnp - snpInd)
    eexp <- snpInd - 1
    toAdd <- (10 ^ eexp) * rep(
      c(rep(1, inner), rep(2, inner)),
      outer
    )
    nmedoub <- nmedoub + toAdd
  }
  nmestri <- paste("x", nmedoub, sep = "")
  return(nmestri)
}

# Generates integers given the probability for each integer
# nObs = number of observations to generate
# pGeno = vector of probability weights
# Outputs a vector of integers
genBag <- function(nObs, pGeno){
  if (!isTRUE(all.equal(sum(pGeno), 1)))
    print("Given probabilities don't add to 1")
  # Generate and return values
  genBagVals <- sample.int(length(pGeno), nObs, replace = TRUE, prob = pGeno)
  return(genBagVals)
}

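# Illustrative sketch (not run when sourcing): the pmf entries returned by the
# two optimization routines are named so that the j-th digit after "x" is the
# coded value of SNP j, with SNP1 varying slowest across the support.
if (FALSE) {
  namingForBernoullipmf(2)  # "x11" "x12" "x21" "x22"
  namingForBinomialPmf(2)   # "x11" "x12" "x13" "x21" ... "x33"
}
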
###########################
# Kang and Jung 2 SNP method
# Simulates a number of observations from two correlated binomials
# Outputs a matrix of observations
Generate2SnpBernoulli <- function(p1, p2, rho, nObs){
  # p1 = minor allele frequency of snp 1
  # p2 = minor allele frequency of snp 2
  # rho = correlation between SNP1 and SNP2
  # nObs = number of observations to generate

  # Compute the four probabilities for the joint distribution.
  a0 <- rho * sqrt(p1 * p2 * (1 - p1) * (1 - p2)) + (1 - p1) * (1 - p2)
  prob <- c(
    `(0,0)` = a0,
    `(1,0)` = 1 - p2 - a0,
    `(0,1)` = 1 - p1 - a0,
    `(1,1)` = a0 + p1 + p2 - 1
  )
  # Check for valid probabilities and return NA if invalid;
  # the choice of p1, p2 and the correlation coefficient
  # can produce negative probabilities
  if (min(prob) < 0) {
    return(as.matrix(cbind(NA, NA)))
  }
  nbin <- 2
  # Sample integers with the specified weights
  u <- sample.int(4, nObs * nbin, replace = TRUE, prob = prob)
  # Split into the Bernoulli representation
  y <- floor((u - 1) / 2)
  x <- 1 - u %% 2
  # Sum up "columns" to get binomials
  xx <- colSums(matrix(x, nrow = nbin))  # Sum in groups of `nbin`
  yy <- colSums(matrix(y, nrow = nbin))  # Sum in groups of `nbin`
  return(as.matrix(cbind(xx, yy)))
}

###################################################
# Madsen and Birkes Method
# Simulates observations from correlated binomials using
# the Madsen and Birkes method
# Returns the generated observations in a matrix, as well as the
# correlations used for the multivariate normal as additional columns
# in the matrix
GenerateSnpMadsenBirkes <- function(pAlleleMinor, targetBinCorMat, nBinObs,
                                    useEigAdjust = FALSE){
  # pAlleleMinor = vector of minor allele frequencies
  # targetBinCorMat = matrix of desired correlations
  # nBinObs = number of observations to generate
  require(gdata)
  require(matrixcalc)
  require(MASS)

  nBin <- rep(2, length(pAlleleMinor))
  # Calculate the correlation matrix for the multivariate normal
  sigmaZ <- CalculateSigmaZ(nBin, pAlleleMinor, targetBinCorMat)

  # Ensures the correlation matrix is positive definite if desired
  if (useEigAdjust) {
    sigmaZ <- CalcAdjustMadsenInverseEigen(sigmaZ)
  }
  if (is.symmetric.matrix(sigmaZ)){
    if (!is.positive.definite(sigmaZ)){
      llen <- length(pAlleleMinor) + ncol(sigmaZ)
      retVal <- matrix(NA, ncol = llen)
      return(retVal)
    }
  }

  normMeans <- nBin * 0  # mean is just 0
  # Simulate from the multivariate normal
  generatedNormalObs <- mvrnorm(nBinObs, normMeans, sigmaZ)

  # Convert values from the multivariate normal to binomial
  transformedNormToBinMat <- as.matrix(
    TransformNormToBin(
      pAlleleMinor = pAlleleMinor,
      normalGeneratedMat = generatedNormalObs)
  )

  # Return value: generated observations plus the upper triangle of sigmaZ
  retVal <- cbind(transformedNormToBinMat,
                  matrix(upperTriangle(sigmaZ),
                         byrow = TRUE,
                         ncol = length(upperTriangle(sigmaZ)),
                         nrow = nBinObs))
  return(as.matrix(retVal))
}

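# Illustrative sketch (hypothetical values, not run when sourcing): generate
# the same two-SNP scenario with the Kang and Jung method and with the Madsen
# and Birkes method and compare the empirical correlations. The Kang and Jung
# construction returns NA when the chosen rho is infeasible for the given
# p1 and p2; the Madsen and Birkes generator needs the gdata, matrixcalc,
# MASS and mvtnorm packages, and its first two columns are the simulated SNPs
# while the remaining column(s) carry the normal correlation(s) used.
if (FALSE) {
  p1 <- 0.2; p2 <- 0.3; rho <- 0.2; nObs <- 5000
  simKJ <- Generate2SnpBernoulli(p1, p2, rho, nObs)
  cor(simKJ)[1, 2]  # roughly rho when the cell probabilities are valid
  simMB <- GenerateSnpMadsenBirkes(c(p1, p2), matrix(c(1, rho, rho, 1), 2, 2), nObs)
  cor(simMB[, 1:2])[1, 2]  # roughly rho as well
}
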
# Calculates the correlation matrix used for the multivariate normal
# given the target correlation matrix for the correlated binomial random vector
# Outputs a matrix
CalculateSigmaZ <- function(nBin, pAlleleMinor, corDesire) {
  # nBin = should be a vector of just 2's; the number of "trials"
  # pAlleleMinor = vector of minor allele frequencies
  # corDesire = correlation matrix for the target r.v.

  # Calculate the correlation matrix for the normals
  corLen <- length(corDesire[, 1])
  sigmaZ <- diag(length(corDesire[, 1]))
  # Note: can speed up by running the loop in parallel
  # Calculates the correlation matrix
  for (i in c(1:corLen)) {
    for (j in c(1:corLen)) {
      # Initialize values needed for calculating delta
      corD <- corDesire[i, j]
      pI <- pAlleleMinor[i]
      pJ <- pAlleleMinor[j]
      nI <- nBin[i]
      nJ <- nBin[j]
      sigmaI2 <- pI * nI * (1 - pI)
      sigmaJ2 <- pJ * nJ * (1 - pJ)
      uI <- nI * pI
      uJ <- nJ * pJ
      # Calculate the matrix entry by entry.
      # The matrix is symmetric and the main diagonal is all 1's
      if (i == j) {
        # diagonal elements of the correlation matrix
        sigmaZ[i, j] <- 1
      } else if (i < j) {
        # Compute the entry in the upper triangle part
        sigmaZ[i, j] <- solveSigmaEqnForNormToBin(corD, sigmaI2, sigmaJ2,
                                                  uI, uJ, nI, nJ, pI, pJ)
      } else if (i > j) {
        # Copy the upper triangle part for the lower triangle
        sigmaZ[i, j] <- sigmaZ[j, i]
      }
    }
  }
  return(sigmaZ)
}

# Numerically solves for the entry in the correlation matrix
# of the multivariate normal needed to get the desired correlation
# after transformation to correlated binomials.
# Uses optimize to solve the equation.
# Equation is from Madsen and Birkes (2013)
# Outputs a numeric value
solveSigmaEqnForNormToBin <- function(corDesire, sigmaI2, sigmaJ2,
                                      uI, uJ, nI, nJ, pI, pJ){
  # corDesire is the correlation desired between the two binomial r.v.
  # sigmaI2, uI, nI, pI are specified parameters for the first binomial r.v.
  # sigmaJ2, uJ, nJ, pJ are specified parameters for the second binomial r.v.
  require(mvtnorm)
  # Initialize the values we will be using
  sigmaI <- sqrt(sigmaI2)
  sigmaJ <- sqrt(sigmaJ2)

  # Support of the distributions: 2 binomials
  rVals <- c(0, 1, 2, 0, 1, 2, 0, 1, 2)
  sVals <- c(0, 0, 0, 1, 1, 1, 2, 2, 2)

  # Cdf of the R and S values
  FiRVec <- pbinom(rVals, nI, pI)
  FjSVec <- pbinom(sVals, nJ, pJ)
  # Inverse of the standard normal cdf of FiR and FjS
  NInvFiRVec <- qnorm(FiRVec)
  NInvFjSVec <- qnorm(FjSVec)

  # Function we will be optimizing to find delta.
  # delta is what the value should be in the correlation matrix for
  # the multivariate normal
  fx.opt <- function(deltaInit){
    delta <- deltaInit[1]
    # Function to evaluate the bivariate normal probabilities
    # with the delta being optimized
    fx.BivarNorm <- function(nFiR, nFiS) {
      sigDelta <- matrix(c(1, delta, delta, 1), nrow = 2, ncol = 2)
      pmvnorm(upper = c(nFiR, nFiS),
              mean = c(0, 0),
              sigma = sigDelta)
    }
    # Constructs a vector so we can sum the bivariate normal values up
    BiNormVec <- vector(mode = "double", length = length(rVals))
    for (i in c(1:9)) {
      BiNormVec[i] <- fx.BivarNorm(NInvFiRVec[i], NInvFjSVec[i])
    }
    # Assembling the equation
    eqSumTermVec <- 1 - FiRVec - FjSVec + BiNormVec
    rightEqTerm <- 1 / (sigmaI * sigmaJ) * (sum(eqSumTermVec) - uI * uJ)
    leftTerm <- corDesire
    # What is being optimized
    (rightEqTerm - leftTerm)^2
  }
  # Delta is between -1 and 1
  opt <- optimize(fx.opt, interval = c(-1, 1))
  estimatedDelta <- opt$minimum
  return(estimatedDelta)
}

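# Illustrative sketch (hypothetical values, not run when sourcing): solve for
# the normal-scale correlation (delta) needed to hit a target binomial
# correlation for a single pair. Requires the mvtnorm package. Because the
# normal-to-binomial transformation attenuates dependence, the solved delta is
# typically somewhat larger in magnitude than the target correlation.
if (FALSE) {
  p1 <- 0.2; p2 <- 0.3; n <- 2
  delta <- solveSigmaEqnForNormToBin(corDesire = 0.3,
                                     sigmaI2 = n * p1 * (1 - p1),
                                     sigmaJ2 = n * p2 * (1 - p2),
                                     uI = n * p1, uJ = n * p2,
                                     nI = n, nJ = n, pI = p1, pJ = p2)
  delta  # normal-scale correlation to place in sigmaZ for this pair
}
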
# Maps values from the normal distribution to the binomial distribution
# based on cut-offs of the multivariate normal
# Outputs a matrix of values
TransformNormToBin <- function(pAlleleMinor, normalGeneratedMat) {
  # pAlleleMinor = vector of minor allele frequencies
  # normalGeneratedMat = matrix of values from the multivariate
  # normal distribution
  transBinMat <- sapply(c(1:length(pAlleleMinor)), function(indexBin) {
    x <- normalGeneratedMat[, indexBin]
    minAllele <- pAlleleMinor[indexBin]
    # Transform the normal marginal to a standard uniform
    unifVal <- pnorm(x, mean = 0, sd = 1)
    # Map the uniform to a binomial using the cutoffs
    binValue <- TransformUnifToBin(minAllele, unifVal)
    return(binValue)
  })
  return(data.frame(transBinMat))
}

# Transforms standard uniform values to binomial(2, p)
# given that p is the minor allele frequency
TransformUnifToBin <- function(minAllele, unifVal) {
  # minAllele = minor allele frequency
  # unifVal = vector of uniform values
  unifVal <- as.double(unifVal)
  minAllele <- as.double(minAllele)

  # Transform to the specified marginal binomial.
  # Cutoffs for mapping the uniform to a binomial
  binCutHomoMinor <- minAllele ^ 2
  binCutHomoMajor <- (1 - minAllele) ^ 2
  binCutHetero <- 2 * minAllele * (1 - minAllele)

  # Map the uniform to a binomial using the cutoffs
  binValue <- ifelse(unifVal < binCutHomoMinor,
                     2,
                     ifelse(unifVal < binCutHetero + binCutHomoMinor,
                            1,
                            0))
  return(binValue)
}

# Takes a matrix and adjusts it to be positive definite
# by setting any non-positive eigenvalues to a small positive number
# Outputs a matrix
CalcAdjustMadsenInverseEigen <- function(A, tol = 1e-5){
  # A is the matrix to adjust
  # tol is the small positive value that non-positive eigenvalues are set to
  eigens <- eigen(A)
  eigVal <- eigens$values
  eigVec <- eigens$vectors
  # A = Q %*% Lambda %*% Q^-1
  Q <- eigVec
  Qinv <- solve(Q)
  # Adjust the lambda by setting any non-positive eigenvalues to some small
  # positive number
  eigValAdj <- ifelse(eigVal <= 0, tol, eigVal)
  lamadj <- diag(eigValAdj, ncol = length(eigVal), nrow = length(eigVal))
  # Construct the adjusted matrix
  Aadj <- Q %*% lamadj %*% Qinv
  # Return the matrix as a correlation matrix
  return(cov2cor(Aadj))
}

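# Illustrative sketch (hypothetical values, not run when sourcing): the
# cutoffs partition [0, 1] into pieces of length p^2, 2p(1 - p) and (1 - p)^2,
# so feeding in standard uniforms should reproduce Binomial(2, p) genotype
# frequencies (coded 2 = homozygous minor, 1 = heterozygous, 0 = homozygous major).
if (FALSE) {
  p <- 0.25
  u <- runif(1e5)
  table(TransformUnifToBin(p, u)) / 1e5
  # roughly c(`0` = (1 - p)^2, `1` = 2 * p * (1 - p), `2` = p^2)
  dbinom(0:2, size = 2, prob = p)  # the same probabilities from dbinom
}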