# Load the files containing the methylation data and the source code containing the SSRPMM functions

################ EXAMPLE ANALYSES TO ILLUSTRATE SS-RPMM ########################

# Load the files containing the methylation data and the source code
# containing the SSRPMM functions.
#
# Note, the SSRPMM Tutorial Objects.Rdata file contains 4 objects, representing
# the methylation beta values and covariate data for a head and neck squamous
# cell carcinoma (HNSCC) cancer data set (Langevin et al. 2012) and a
# mesothelioma cancer data set (Christensen et al. 2009). The four objects are
# named: Betas_HNSCC, Covariates_HNSCC, Betas_Mesothelioma, and
# Covariates_Mesothelioma.
#   - Betas_HNSCC:             N x J data.frame of methylation average-beta values
#   - Covariates_HNSCC:        N x P data.frame of covariates for the HNSCC samples
#   - Betas_Mesothelioma:      N x J data.frame of methylation average-beta values
#   - Covariates_Mesothelioma: N x P data.frame of covariates for the
#                              Mesothelioma samples
#
# NOTE(review): the description above spells the data file
# "SSRPMM Tutorial Objects.Rdata", while the paths below start with lowercase
# "ssrpmm". On a case-sensitive file system the paths must match the files on
# disk exactly -- confirm.
load("ssrpmm Tutorial Objects.Rdata")
source("ssrpmm Functions 10-03-2012.R")

# Store relevant details regarding the HNSCC and Mesothelioma data
J1 = ncol(Betas_HNSCC)       # number of CpG loci in the HNSCC dataset
N1 = nrow(Betas_HNSCC)       # number of samples in the HNSCC dataset
P1 = ncol(Covariates_HNSCC)  # number of covariate factors in the HNSCC covariate data
# Beta values and covariates must describe the same samples; stop early if not.
stopifnot(N1 == nrow(Covariates_HNSCC))

J2 = ncol(Betas_Mesothelioma)       # number of CpG loci in the Mesothelioma dataset
N2 = nrow(Betas_Mesothelioma)       # number of samples in the Mesothelioma dataset
P2 = ncol(Covariates_Mesothelioma)  # number of covariate factors in the Mesothelioma covariate data
stopifnot(N2 == nrow(Covariates_Mesothelioma))

# STEP 1: Randomly split the full data into training and testing sets.
#         (1) For the Mesothelioma data do a stratified random split by tumor
#             histology and
#         (2) for the HNSCC data do a stratified random split by HNSCC
#             case/control status.
#         The stratified random split is implemented here to ensure an
#         approximately equal distribution of subjects between the training
#         and testing sets, with respect to a certain variable.
HNSCCSplit = TrainTestSplit(Betas_HNSCC, Covariates_HNSCC,
                            Strat = "case", seed = 1, proptrain = 1/2)
MesoSplit = TrainTestSplit(Betas_Mesothelioma, Covariates_Mesothelioma,
                           Strat = "histology", seed = 12, proptrain = 1/2)

# STEP (1a): Processing the results of the TrainTestSplit function. Extract the
#            training and testing data from the Split objects above.
HNSCCTrainingData = HNSCCSplit[[1]]
HNSCCTestingData = HNSCCSplit[[2]]

MesoTrainingData = MesoSplit[[1]]
MesoTestingData = MesoSplit[[2]]

# STEP (1b): Extract the beta values and the covariate values from the training
#            and testing sets from STEP (1a). The first P columns of each split
#            are treated as covariates and the remaining columns as beta values.

# HNSCC Training Data
HNSCCTrainingBetas = HNSCCTrainingData[, -(1:P1)]
HNSCCTrainingCovariates = HNSCCTrainingData[, (1:P1)]

# HNSCC Testing Data
HNSCCTestingBetas = HNSCCTestingData[, -(1:P1)]
HNSCCTestingCovariates = HNSCCTestingData[, (1:P1)]

# Meso Training Data
MesoTrainingBetas = MesoTrainingData[, -(1:P2)]
MesoTrainingCovariates = MesoTrainingData[, (1:P2)]

# Meso Testing Data
MesoTestingBetas = MesoTestingData[, -(1:P2)]
MesoTestingCovariates = MesoTestingData[, (1:P2)]

# STEP 2: Identify the CpG loci that are most associated with the clinical
#         variable of interest using only the training data. If the clinical
#         variable of interest is time to event data (i.e. survival or disease
#         recurrence) then use the function called MostImpCpGsSurvival,
#         otherwise use the function called MostImpCpGs. This function will
#         provide a ranked list of CpGs and their corresponding T-score. The
#         CpGs at the top of the list are those that are most associated with
#         the clinical variable of interest.

# HNSCC data: clinical variable of interest is "case/control" status
HNSCCScores = MostImpCpGs(Y = HNSCCTrainingBetas,
                          covariates = HNSCCTrainingCovariates,
                          clinvar = "case", terms = NULL, factors = NULL)

# Meso data: clinical variable of interest is "survival" status
MesoScores = MostImpCpGsSurvival(Y = MesoTrainingBetas,
                                 covariates = MesoTrainingCovariates,
                                 times = "time", censor = "dead",
                                 terms = c("age", "sex"), factors = NULL,
                                 strat = "histology")

# STEP 3a: The objective of this step is to determine the number of high ranking
#          CpG loci from the previous step to be used in fitting RPMM to the
#          Training data. Note* if the clinical variable of interest is survival
#          then use the NestedXValidationSurvival function, otherwise use the
#          NestedXValidation function.
#          Warning, depending on the selection of mrange and L, this step can
#          take quite some time.
HNSCCXvalidationResults = NestedXValidation(Y = HNSCCTrainingBetas,
                                            covariates = HNSCCTrainingCovariates,
                                            TScores = HNSCCScores,
                                            clinvar = "case", vartype = "binary",
                                            mrange = c(5,50), method = "gaussian",
                                            L = 20, seeds = 1:20)

MesoXvalidationResults = NestedXValidationSurvival(Y = MesoTrainingBetas,
                                                   covariates = MesoTrainingCovariates,
                                                   CoxScores = MesoScores,
                                                   times = "time", censor = "dead",
                                                   mrange = c(5,50),
                                                   method = "gaussian",
                                                   L = 20, seeds = 1:20)

# STEP 3b: The selection of M can be based on what value of m yielded the lowest
#          median p-value as indicated in the HNSCCXvalidationResults and
#          MesoXvalidationResults objects. Alternatively, a better approach to
#          select M would be to smooth the HNSCCXvalidationResults and
#          MesoXvalidationResults objects and choose the value of M where the
#          smooth function of median p-values attains its minimum value.
#
# NOTE(review): the cross-validation calls above use mrange = c(5,50) while the
# grid below is 5:20. loess.smooth() needs one x value per median p-value, so
# this grid must match whatever the NestedXValidation* functions return --
# confirm against the sourced functions file.
# NOTE(review): loess.smooth() evaluates the fit on its own grid of points, so
# the selected M is not guaranteed to be an integer -- confirm this is intended.
mrange = 5:20

# Mesothelioma: M at the minimum of the smoothed median p-value curve
loesscurve = loess.smooth(mrange, MesoXvalidationResults, degree = 2)
MOpt_Mesothelioma = loesscurve$x[which.min(loesscurve$y)]

# HNSCC: same selection rule. MOpt_HNSCC is required by STEP 4 below.
# NOTE(review): presumably HNSCCXvalidationResults has the same layout as
# MesoXvalidationResults -- confirm.
loesscurve_HNSCC = loess.smooth(mrange, HNSCCXvalidationResults, degree = 2)
MOpt_HNSCC = loesscurve_HNSCC$x[which.min(loesscurve_HNSCC$y)]

# Plot the Mesothelioma median p-values, the smoothed curve, and the chosen M
par(mar = c(5,5,4,2))
plot(mrange, MesoXvalidationResults, cex = 0.75,
     xlab = "Number of top ranking loci (M)", ylab = "Median P-value",
     cex.lab = 2, cex.axis = 1.5)
lines(loesscurve$x, loesscurve$y, lwd = 5)
abline(v = MOpt_Mesothelioma, col = "red", lwd = 2, lty = "dashed")

# STEP 4: The final step of SS-RPMM is to fit an RPMM to the Training Data using
#         the M CpG loci with the largest absolute T-Score, where M is determined
#         based on the results from step 3. Based on this solution, we want to
#         predict the methylation class membership for the observations in the
#         test data. We achieve this using the PredMethClasses function.
HNSCCMethClassesTesting = PredMethClasses(Ytrain = HNSCCTrainingBetas,
                                          Ytest = HNSCCTestingBetas,
                                          Scores = HNSCCScores,
                                          M = MOpt_HNSCC, method = "gaussian")

MesoMethClassesTesting = PredMethClasses(Ytrain = MesoTrainingBetas,
                                         Ytest = MesoTestingBetas,
                                         Scores = MesoScores,
                                         M = MOpt_Mesothelioma,
                                         method = "gaussian")

################# POST SS-RPMM ANALYSES ####################

# Post SS-RPMM Analysis: Following the prediction of the methylation classes
#                        in the Testing data, we now want to determine whether
#                        or not the methylation classes we've identified are
#                        clinically relevant. We do this by testing the
#                        association between the predicted methylation classes
#                        in the test data and the clinical outcome of interest
#                        (i.e. case/control status for the HNSCC data and
#                        survival for the Meso data).

# Some Post SS-RPMM analyses for the HNSCC data

# [1] Test whether or not the predicted methylation classes in the test data
#     are associated with HNSCC case/control status.
# NOTE(review): permtestchisquare is expected to come from the sourced
# functions file; its exact spelling/case cannot be checked from this script --
# confirm.
permtestchisquare(HNSCCMethClassesTesting, HNSCCTestingCovariates[,"case"])

# [2] generate a barplot of percent case/control by predicted methylation class
Table = table(HNSCCMethClassesTesting, HNSCCTestingCovariates[,"case"])
RowSum = rowSums(Table)
# Divide each row (methylation class) by its total to get within-class proportions
Percentages = sweep(Table, 1, RowSum, "/")
barplot(t(Percentages), main = "Percent Case/Control by RPMM Class",
        ylab = "Percent Case/Control", xlab="predicted Class",
        col=c("darkblue","red"))

# [3] Test whether or not the predicted methylation classes in the test data
#     are associated with HNSCC case/control status, controlling for confounders.
HNSCCfit = glm(case~gender+age+packyrs+factor(smk_cfn)+HNSCCMethClassesTesting,
               data = HNSCCTestingCovariates,
               family = binomial(link = "logit"))
summary(HNSCCfit)

# [4] generate a heatmap of the Testing Data by predicted methylation class
# NOTE(review): M is fixed at 6 here (matching K and the axis title) and is not
# tied to MOpt_HNSCC -- confirm this is intended.
K = seq(.5,5.5,1)   # axis tick positions, one per displayed locus
M = 6
HNSCCTestingBetasTopM = as.matrix(HNSCCTestingBetas[,rownames(HNSCCScores)[1:M]])
# Order the loci by hierarchical clustering. Recent versions of R accept "ward"
# as an alias for "ward.D" and emit a message saying so.
OrdBeta = hclust(dist(t(HNSCCTestingBetasTopM)), method="ward")$order
par(pty = "m", mai = c(1.5,.65,.1,.6))
# NOTE(review): plotmethbyclass is expected to come from the sourced functions
# file; its exact spelling/case cannot be checked from this script -- confirm.
plotmethbyclass(HNSCCTestingBetasTopM, HNSCCMethClassesTesting, sep = "red", OrdBeta)
axis(1, at = K, labels = rownames(HNSCCScores)[1:M], las=2, cex.axis=0.8,
     line = -.5)
title(xlab = "Loci used for determining subtypes (M=6)", line = 4.8,
      outer = FALSE, cex.lab = 1.5)

# Some Post SS-RPMM analyses for the Meso data
# NOTE(review): Surv, survfit, survdiff, coxph and strata come from the
# survival package, which is presumably attached by the sourced functions
# file -- confirm.

# [1] Kaplan-Meier survival plot by predicted methylation class in the testing
#     data
plot(survfit(Surv(time,dead)~ MesoMethClassesTesting, data = MesoTestingCovariates),
     col = rainbow(length(levels(MesoMethClassesTesting))),
     xlab = "Time (months)", ylab = "Probability of Survival",
     cex.lab = 1.5, cex.axis = 1.2, lwd = 2)
legend("bottomleft", legend = levels(MesoMethClassesTesting),
       col = rainbow(length(levels(MesoMethClassesTesting))),
       cex = 1.2, lty = 1, lwd = 2)

# [2] Test whether or not the predicted methylation classes in the test data
#     are associated with survival time (i.e. log-rank test).
survdiff(Surv(time,dead)~MesoMethClassesTesting, data = MesoTestingCovariates)

# [3] Test whether or not the predicted methylation classes in the test data
#     are associated with survival time, controlling for confounders.
CoxModMeso = coxph(Surv(time,dead)~MesoMethClassesTesting + age + sex +
                     strata(histology), data = MesoTestingCovariates)
summary(CoxModMeso)