This is the official web companion for the following books:
A Biologist's Guide to Analysis of DNA Microarray Data (Knudsen, S.; Wiley, 2002)
Guide to Analysis of DNA Microarray Data (Knudsen, S.; Wiley, 2004)
Cancer Diagnostics with DNA Microarrays (Knudsen, S.; Wiley, 2006)
R: Public domain data analysis package available for download
for a number of different computer platforms at
www.r-project.org. There are several online manuals available:
for example
An Introduction to R (PDF file).
R code for cut-and-paste of examples listed in the books:
Example
- Create data file with example in Table 10.1 (12.2). On a Unix/Linux system you would
do it like this, followed by a Control-d character:
cat >example
Pat_N1 Pat_N2 Pat_A1 Pat_A2 Pat_B1 Pat_B2
gene_a 90 110 190 210 290 310
gene_b 190 210 390 410 590 610
gene_c 90 110 110 90 120 80
gene_d 200 100 400 90 600 200
- Start R (examples below have been tested with version 1.3 and 1.4)
by entering the letter R at the prompt.
- t-Test: Enter the following commands:
dataf <- read.table("example")
# load library for t-test:
library(ctest)
# t-test function:
get.pval.ttest <- function(dataf,index1,index2,
datafilter=as.numeric){
f <- function(i) {
return(t.test(datafilter(dataf[i,index1]),
datafilter(dataf[i,index2]))$p.value)
}
return(sapply(1:length(dataf[,1]),f))
}
# call function with our data:
pVal.ttest <- get.pval.ttest(dataf,3:4,5:6)
# print results on screen (only for a small dataset like this)
print(cbind(dataf,pVal.ttest))
Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2 pVal.ttest
gene_a 90 110 190 210 290 310 0.01941932
gene_b 190 210 390 410 590 610 0.00496281
gene_c 90 110 110 90 120 80 1.00000000
gene_d 200 100 400 90 600 200 0.60590011
# sort on P-value and write to file:
orders <- order(pVal.ttest)
ordered.data <- cbind(dataf[orders,],pVal.ttest[orders])
write(t(as.matrix(ordered.data)),
ncolumns=length(dataf)+1,
file = "ttest.out")
- Wilcoxon: Enter the following commands:
dataf <- read.table("example")
# load library for t-test:
library(ctest)
# t-test function:
get.pval.wilcox <- function(dataf,index1,index2,
datafilter=as.numeric){
f <- function(i) {
return(wilcox.test(datafilter(dataf[i,index1]),
datafilter(dataf[i,index2]))$p.value)
}
return(sapply(1:length(dataf[,1]),f))
}
# call function with our data:
pVal.wilcox <- get.pval.wilcox(dataf,3:4,5:6)
# print results on screen (only for a small dataset like this)
print(cbind(dataf,pVal.wilcox))
Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2 pVal.wilcox
gene_a 90 110 190 210 290 310 0.3333333
gene_b 190 210 390 410 590 610 0.3333333
gene_c 90 110 110 90 120 80 1.0000000
gene_d 200 100 400 90 600 200 0.6666667
# sort on P-value and write to file:
orders <- order(pVal.ttest)
ordered.data <- cbind(dataf[orders,],pVal.ttest[orders])
write(t(as.matrix(ordered.data)),
ncolumns=length(dataf)+1,
file = "wilcox.out")
- ANOVA: Enter the following commands:
dataf <- read.table("example")
Categories <- as.factor(c("N","N","A","A","B","B"))
indexAvgDiff <- 1:6
# ANOVA function by Laurent:
get.pval.anova <-
function(dataf,indexAll,templateCategories,datafilter=as.numeric){
templateCategories <- as.factor(templateCategories)
f <- function(i) {
return(summary(
aov(datafilter(dataf[i,indexAll]) ~ templateCategories)
)
[[1]][4:5][[2]][1])
}
return(sapply(1:length(dataf[,1]),f))
}
# call the function, sort and write results:
pVal.anova <- get.pval.anova(dataf,indexAvgDiff,Categories)
print(cbind(dataf,pVal.anova))
Pat.N1 Pat.N2 Pat.A1 Pat.A2 Pat.B1 Pat.B2 pVal.anova
gene_a 90 110 190 210 290 310 0.0017965439
gene_b 190 210 390 410 590 610 0.0002283540
gene_c 90 110 110 90 120 80 1.0000000000
gene_d 200 100 400 90 600 200 0.5560965577
orders <- order(pVal.anova)
ordered.data <- cbind(dataf[orders,],pVal.anova[orders])
write(t(as.matrix(ordered.data)),ncolumns=length(dataf)+1,file = "ANOVA.out")
- PCA: Enter the following commands:
library(mva)
dataf <- read.table("example")
pca <- princomp(dataf,cor=TRUE,scores=TRUE)
plot(pca)
biplot(pca)
- Correspondence Analysis: Enter the following commands:
library(MASS)
library(mva)
dataf <- read.table("example")
cs <- corresp(dataf,nf=2)
plot(cs)
- Classification: Download
the public SBRCT
dataset by Khan et al (2001). Enter the following commands:
library(class)
library(mva)
dataf <- read.table("2308genes63.txt",header=T)
tposed <- t(dataf)
knn.targets <- factor( c(rep("E", 23), rep("B", 8), rep("N", 12), rep("R", 20)) )
knn.cv(tposed, knn.targets, k=3, prob=TRUE)
E E E E E E E E E E E E E E E E E E E E E E E E
B B B B B B B B N N N N N N N N N N N N R R R N
R R R R R E R R R R R R R R R R
- Neural networks: Enter the following commands:
library(nnet)
tposed <- t(dataf)
pca <- princomp(tposed,cor=TRUE,scores=TRUE)
canc <- pca$scores[,1:10]
targets <- class.ind( c(rep("E", 23), rep("B", 8), rep("N", 12), rep("R", 20)) )
f <- function(samp) {
b <- c(0,0,0,0)
for(i in 1:20) {
trainednet <- nnet(canc[-samp,], targets[-samp,],
size=2,skip=FALSE,trace=FALSE, maxit=300)
a <- max.col(predict.nnet(trainednet, canc[samp,]))
b[a] <- b[a] + 1
}
print(b)
}
lapply(1:63,f)
- The affy package:
Download sample raw CEL files, e.g. GSE3212
From GEO.
# use version 1.2.30 or higher of the affy package:
library(affy)
# read CEL files:
data <- read.affybatch("GSM71942.CEL.gz","GSM71991.CEL.gz",
"GSM72243.CEL.gz","GSM72244.CEL.gz","GSM72245.CEL.gz",
"GSM72246.CEL.gz","GSM72247.CEL.gz","GSM72248.CEL.gz",
"GSM72249.CEL.gz","GSM72250.CEL.gz")
# background correct, normalize, and condense:
affy.es <- expresso(data, bgcorrect.method="rma2",
normalize.method = "qspline",
pmcorrect.method="pmonly",
summary.method ="liwong")
Matrix <- exprs(affy.es)
# t-test function:
get.pval.ttest <- function(dataf,index1,index2,
datafilter=as.numeric){
f <- function(i) {
return(t.test(datafilter(dataf[i,index1]),
datafilter(dataf[i,index2]))$p.value)
}
return(sapply(1:length(dataf[,1]),f))
}
# perform t-test:
pValues <- get.pval.ttest(Matrix,1:5,6:10)
# write ranked results to file:
orders <- order(pValues)
ordered.data <- cbind(rownames(Matrix)[orders],Matrix[orders,],
pValues[orders])
write(t(as.matrix(ordered.data)), ncolumns=1+ncol(Matrix)+1,
file = "Pvalues.abs")
- The affy package for non-affy data: Enter the following commands:
library(affy)
# read file:
dataf <- read.table("input.file", row.names=1, sep="t",
header=F, comment.char = "")
# normalize using qspline:
normdata <- normalize.qspline(dataf, na.rm=TRUE)
# t-test function:
get.pval.ttest <- function(dataf,index1,index2,
datafilter=as.numeric){
f <- function(i) {
return(t.test(datafilter(dataf[i,index1]),
datafilter(dataf[i,index2]))$p.value)
}
return(sapply(1:length(dataf[,1]),f))
}
# perform t-test:
pValues <- get.pval.ttest(normdata,1:12,13:24)
# write ranked results to file:
orders <- order(pValues)
ordered.data <- cbind(rownames(dataf)[orders],normdata[orders,],
pValues[orders])
write(t(as.matrix(ordered.data)), ncolumns=1+ncol(normdata)+1,
file = "Pvalues.abs")
- Survival Analysis: Kaplan-Meier: Enter the following commands:
# plot survival curves:
library(survival)
data(gehan)
gehan.surv <- survfit(Surv(time, cens) ~ treat, data = gehan, conf.type = "log-log")
plot(gehan.surv, ylab="Survival", xlab="Remission time (weeks)")
text(30,0.5,pos=4,"6-MP"); text(23,0.05,pos=4, "control")
# perform log-rank test for significance of difference between curves:
survdiff(Surv(time, cens) ~ treat, data = gehan)
- Survival Analysis: Cox Proportional Hazards: Enter the following commands:
fit <- coxph(Surv(time, cens) ~ strata(treat), data=gehan)
plot(survfit(fit), ylab="Survival", xlab="Remission time (weeks)")
text(17,0.7,pos=4, "6-MP"); text(17,0.2,pos=4, "control")
|