This is the official web companion for the following books:

A Biologist's Guide to Analysis of DNA Microarray Data (Knudsen, S.; Wiley, 2002)
Guide to Analysis of DNA Microarray Data (Knudsen, S.; Wiley, 2004)
Cancer Diagnostics with DNA Microarrays (Knudsen, S.; Wiley, 2006)

R: Public domain data analysis package available for download for a number of different computer platforms at www.r-project.org. There are several online manuals available: for example An Introduction to R (PDF file).

R code for cut-and-paste of examples listed in the books:

Example

  1. Create data file with example in Table 10.1 (12.2). On a Unix/Linux system you would do it like this, followed by a Control-d character:
        cat >example
                 Pat_N1 Pat_N2 Pat_A1 Pat_A2 Pat_B1 Pat_B2
        gene_a      90    110    190    210    290    310
        gene_b     190    210    390    410    590    610
        gene_c      90    110    110     90    120     80
        gene_d     200    100    400     90    600    200
        
  2. Start R (examples below have been tested with version 1.3 and 1.4) by entering the letter R at the prompt.

  3. t-Test: Enter the following commands:
          dataf  <- read.table("example")
        # load library for t-test:
          library(ctest)
        
        # t-test function:
          get.pval.ttest <- function(dataf,index1,index2,
            datafilter=as.numeric){
            f <- function(i) {
              return(t.test(datafilter(dataf[i,index1]),
                datafilter(dataf[i,index2]))$p.value)
            }
            return(sapply(1:length(dataf[,1]),f))
          }
        
        # call function with our data:
          pVal.ttest <- get.pval.ttest(dataf,3:4,5:6)
        
        # print results on screen (only for a small dataset like this)
          print(cbind(dataf,pVal.ttest))
               Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2 pVal.ttest
        gene_a     90    110    190    210    290    310 0.01941932
        gene_b    190    210    390    410    590    610 0.00496281
        gene_c     90    110    110     90    120     80 1.00000000
        gene_d    200    100    400     90    600    200 0.60590011
        
        # sort on P-value and write to file:
          orders <- order(pVal.ttest)
          ordered.data <- cbind(dataf[orders,],pVal.ttest[orders])
          write(t(as.matrix(ordered.data)),
           ncolumns=length(dataf)+1,
           file = "ttest.out")
        
        
  4. Wilcoxon: Enter the following commands:
          dataf  <- read.table("example")
        # load library for t-test:
          library(ctest)
        
        # t-test function:
          get.pval.wilcox <- function(dataf,index1,index2,
            datafilter=as.numeric){
            f <- function(i) {
              return(wilcox.test(datafilter(dataf[i,index1]),
                datafilter(dataf[i,index2]))$p.value)
            }
            return(sapply(1:length(dataf[,1]),f))
          }
        
        # call function with our data:
          pVal.wilcox <- get.pval.wilcox(dataf,3:4,5:6)
        
        # print results on screen (only for a small dataset like this)
          print(cbind(dataf,pVal.wilcox))
               Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2 pVal.wilcox
        gene_a     90    110    190    210    290    310   0.3333333
        gene_b    190    210    390    410    590    610   0.3333333
        gene_c     90    110    110     90    120     80   1.0000000
        gene_d    200    100    400     90    600    200   0.6666667
        
        # sort on P-value and write to file:
          orders <- order(pVal.ttest)
          ordered.data <- cbind(dataf[orders,],pVal.ttest[orders])
          write(t(as.matrix(ordered.data)),
           ncolumns=length(dataf)+1,
           file = "wilcox.out")
        
        
  5. ANOVA: Enter the following commands:
        dataf  <- read.table("example")
        Categories <- as.factor(c("N","N","A","A","B","B"))
        indexAvgDiff <- 1:6
        
        # ANOVA function by Laurent:
        get.pval.anova <-
        
        function(dataf,indexAll,templateCategories,datafilter=as.numeric){ 
          templateCategories <- as.factor(templateCategories) 
          f <- function(i) { 
            return(summary( 
                           aov(datafilter(dataf[i,indexAll]) ~ templateCategories) 
                           ) 
                   [[1]][4:5][[2]][1]) 
          } 
          return(sapply(1:length(dataf[,1]),f)) 
        }
        
        # call the function, sort and write results:
        pVal.anova <- get.pval.anova(dataf,indexAvgDiff,Categories)
        
        print(cbind(dataf,pVal.anova))
        
               Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2   pVal.anova
        gene_a     90    110    190    210    290    310 0.0017965439
        gene_b    190    210    390    410    590    610 0.0002283540
        gene_c     90    110    110     90    120     80 1.0000000000
        gene_d    200    100    400     90    600    200 0.5560965577
        
        orders <- order(pVal.anova)
        ordered.data <- cbind(dataf[orders,],pVal.anova[orders])
        write(t(as.matrix(ordered.data)),ncolumns=length(dataf)+1,file = "ANOVA.out")
        
        
        
  6. PCA: Enter the following commands:
        library(mva)
        dataf  <- read.table("example")
        pca <- princomp(dataf,cor=TRUE,scores=TRUE)
        plot(pca)
        biplot(pca)
        
        
  7. Correspondence Analysis: Enter the following commands:
        
        library(MASS)
        library(mva)
        dataf  <- read.table("example")
        cs <- corresp(dataf,nf=2)
        plot(cs)
        
        
  8. Classification: Download the public SBRCT dataset by Khan et al (2001). Enter the following commands:
        
        library(class)
        library(mva)
        dataf  <- read.table("2308genes63.txt",header=T)
        tposed <- t(dataf)
        knn.targets <- factor( c(rep("E", 23), rep("B", 8), rep("N", 12), rep("R", 20)) )
        knn.cv(tposed, knn.targets, k=3, prob=TRUE)
        E E E E E E E E E E E E E E E E E E E E E E E E
        B B B B B B B B N N N N N N N N N N N N R R R N
        R R R R R E R R R R R R R R R R
        
        
  9. Neural networks: Enter the following commands:
        
        library(nnet)
        tposed <- t(dataf)
        pca <- princomp(tposed,cor=TRUE,scores=TRUE)
        canc <- pca$scores[,1:10]
        targets <- class.ind( c(rep("E", 23), rep("B", 8), rep("N", 12), rep("R", 20)) )
        f <- function(samp) {
             b <- c(0,0,0,0)
             for(i in 1:20) {
                trainednet <- nnet(canc[-samp,], targets[-samp,],
                        size=2,skip=FALSE,trace=FALSE, maxit=300)
                a <- max.col(predict.nnet(trainednet, canc[samp,]))
                b[a] <- b[a] + 1
             }
             print(b)
        }
        
        lapply(1:63,f)
        
        
  10. The affy package:
    Download sample raw CEL files, e.g. GSE3212 From GEO.
        # use version 1.2.30 or higher of the affy package:
          library(affy)
        # read CEL files:
          data <- read.affybatch("GSM71942.CEL.gz","GSM71991.CEL.gz",
          "GSM72243.CEL.gz","GSM72244.CEL.gz","GSM72245.CEL.gz",
          "GSM72246.CEL.gz","GSM72247.CEL.gz","GSM72248.CEL.gz",
          "GSM72249.CEL.gz","GSM72250.CEL.gz")
          
        # background correct, normalize, and condense:
          affy.es <- expresso(data, bgcorrect.method="rma2", 
                     normalize.method = "qspline", 
                     pmcorrect.method="pmonly", 
                     summary.method ="liwong")
          Matrix <- exprs(affy.es)
        
        # t-test function:
           get.pval.ttest <- function(dataf,index1,index2,
                  datafilter=as.numeric){
                  f <- function(i) {
                    return(t.test(datafilter(dataf[i,index1]),
                      datafilter(dataf[i,index2]))$p.value)
                  }
                  return(sapply(1:length(dataf[,1]),f))
            }   
        
        # perform t-test:
          pValues <- get.pval.ttest(Matrix,1:5,6:10) 
        
        # write ranked results to file:
          orders <- order(pValues)
          ordered.data <- cbind(rownames(Matrix)[orders],Matrix[orders,],
                                pValues[orders])
          write(t(as.matrix(ordered.data)), ncolumns=1+ncol(Matrix)+1,
                file = "Pvalues.abs")
        
  11. The affy package for non-affy data: Enter the following commands:
        
          library(affy)
        # read file:
          dataf <- read.table("input.file", row.names=1, sep="t", 
                               header=F, comment.char = "")
        # normalize using qspline:
          normdata <- normalize.qspline(dataf, na.rm=TRUE)
        
        # t-test function:
           get.pval.ttest <- function(dataf,index1,index2,
                  datafilter=as.numeric){
                  f <- function(i) {
                    return(t.test(datafilter(dataf[i,index1]),
                      datafilter(dataf[i,index2]))$p.value)
                  }
                  return(sapply(1:length(dataf[,1]),f))
            }   
        
        # perform t-test:
          pValues <- get.pval.ttest(normdata,1:12,13:24) 
        
        # write ranked results to file:
          orders <- order(pValues)
          ordered.data <- cbind(rownames(dataf)[orders],normdata[orders,],
                                pValues[orders])
          write(t(as.matrix(ordered.data)), ncolumns=1+ncol(normdata)+1,
                file = "Pvalues.abs")
        
  12. Survival Analysis: Kaplan-Meier: Enter the following commands:
        # plot survival curves:
          library(survival)
          data(gehan)
          gehan.surv <- survfit(Surv(time, cens) ~ treat, data = gehan, conf.type = "log-log")
          plot(gehan.surv, ylab="Survival", xlab="Remission time (weeks)")
          text(30,0.5,pos=4,"6-MP"); text(23,0.05,pos=4, "control")
        
        # perform log-rank test for significance of difference between curves:
          survdiff(Surv(time, cens) ~ treat, data = gehan)
        
        
  13. Survival Analysis: Cox Proportional Hazards: Enter the following commands:
          fit <- coxph(Surv(time, cens) ~ strata(treat), data=gehan)
          plot(survfit(fit), ylab="Survival", xlab="Remission time (weeks)")
          text(17,0.7,pos=4, "6-MP"); text(17,0.2,pos=4, "control")
        
        

Medical Prognosis Institute ApS, Agern Alle 7, 2970 Hørsholm, Denmark
© Copyright 2006 MPI ApS