Dataset

This is a microarray analysis example where the dataset is divded into multiple grouping. Here, we manually define the groupings based on the CEL filenames: Note: ideally, all these information should be stored in a meta-data text file.

Our groupings includes; Ctrl, Ctrl_Trmt, Isch, Isch_Trmt, KO_Ctrl and KO_Isch All together: 6 groups.

#*########################
#*########################
### List data
#*## 
#*## 

Ctrl = c('./data/1C.CEL', './data/2C.CEL', './data/3C.CEL', './data/4C.CEL', './data/5C.CEL', './data/6C.CEL')
Ctrl.labels = c('C-1C', 'C-2C', 'C-3C', 'C-4C', 'C-5C', 'C-6C')

Ctrl_Trmt = c('./data/1E4.CEL', './data/2E4.CEL', './data/3E4.CEL', './data/4E4.CEL', './data/5E4.CEL', './data/6E4.CEL')
Ctrl_Trmt.labels = c('CT-1E4', 'CT-2E4', 'CT-3E4', 'CT-4E4', 'CT-5E4', 'CT-6E4')

Isch = c('./data/1IR.CEL', './data/2IR.CEL', './data/3IR.CEL', './data/4IR.CEL', './data/5IR.CEL', './data/6IR.CEL')
Isch.labels = c('I-1IR', 'I-2IR', 'I-3IR', 'I-4IR', 'I-5IR', 'I-6IR')

Isch_Trmt = c('./data/2EIR.CEL', './data/3EIR.CEL', './data/4EIR.CEL', './data/5EIR.CEL', './data/6EIR.CEL')
Isch_Trmt.labels = c('IT-2EIR', 'IT-3EIR', 'IT-4EIR', 'IT-5EIR', 'IT-6EIR')

KO_Ctrl = c('./data/N1.CEL', './data/N2.CEL', './data/N3.CEL')
KO_Ctrl.labels = c('KOC-N1', 'KOC-N2', 'KOC-N3')

KO_Isch = c('./data/N4.CEL', './data/N5.CEL', './data/N6.CEL', './data/N7.CEL')
KO_Isch.labels = c('KOI-N4', 'KOI-N5', 'KOI-N6', 'KOI-N7')

Import Data

We will use the oligo package to import the dataset

library(oligo)

#test = read.celfiles(filenames = './data/1C.CEL')
#test.exprs = exprs(test)

import.data = read.celfiles(filenames = c(Ctrl, Ctrl_Trmt, Isch, Isch_Trmt, KO_Ctrl, KO_Isch))
## Reading in : ./data/1C.CEL
## Reading in : ./data/2C.CEL
## Reading in : ./data/3C.CEL
## Reading in : ./data/4C.CEL
## Reading in : ./data/5C.CEL
## Reading in : ./data/6C.CEL
## Reading in : ./data/1E4.CEL
## Reading in : ./data/2E4.CEL
## Reading in : ./data/3E4.CEL
## Reading in : ./data/4E4.CEL
## Reading in : ./data/5E4.CEL
## Reading in : ./data/6E4.CEL
## Reading in : ./data/1IR.CEL
## Reading in : ./data/2IR.CEL
## Reading in : ./data/3IR.CEL
## Reading in : ./data/4IR.CEL
## Reading in : ./data/5IR.CEL
## Reading in : ./data/6IR.CEL
## Reading in : ./data/2EIR.CEL
## Reading in : ./data/3EIR.CEL
## Reading in : ./data/4EIR.CEL
## Reading in : ./data/5EIR.CEL
## Reading in : ./data/6EIR.CEL
## Reading in : ./data/N1.CEL
## Reading in : ./data/N2.CEL
## Reading in : ./data/N3.CEL
## Reading in : ./data/N4.CEL
## Reading in : ./data/N5.CEL
## Reading in : ./data/N6.CEL
## Reading in : ./data/N7.CEL
rma.data = rma(import.data)
## Background correcting
## Normalizing
## Calculating Expression
exprs.data = exprs(rma.data)

Avgdata = exprs.data
colnames(Avgdata) = c(Ctrl.labels, Ctrl_Trmt.labels, Isch.labels, Isch_Trmt.labels, KO_Ctrl.labels, KO_Isch.labels)

Principal Component Analysis (PCA)

We will use PCA to cluster the data

library(rgl)

pca = prcomp(t(Avgdata), scale = F)
cols = c(rep('brown2', 6), rep('brown4', 6), rep('cyan1', 6), rep('cyan4', 5), rep('darkolivegreen1', 3), rep('darkolivegreen4', 4))

plot3d(pca$x[,1],pca$x[,2],pca$x[,3], col = cols, size = 30)
text3d(pca$x[,1],pca$x[,2],pca$x[,3], texts=colnames(Avgdata), adj = 1.7)

writeWebGL(dir = './', filename = 'PCA.Almut.webGL.html')

Remove Outlier

Based on the PCA visualization, we have decided to remove some outlier samples

## Removed 4C, 6C, 3E4 and 4E4
Ctrl = c('./data/1C.CEL', 
         './data/2C.CEL', 
         './data/3C.CEL', 
         #'./data/4C.CEL', 
         './data/5C.CEL'
         #'./data/6C.CEL'
        )

Ctrl.labels = c('C-1C', 
                'C-2C', 
                'C-3C', 
                #'C-4C', 
                'C-5C' 
                #'C-6C'
              )

Ctrl_Trmt = c('./data/1E4.CEL', 
              './data/2E4.CEL', 
              #'./data/3E4.CEL', 
              #'./data/4E4.CEL', 
              './data/5E4.CEL', 
              './data/6E4.CEL'
            )
Ctrl_Trmt.labels = c('CT-1E4', 
                     'CT-2E4', 
                     #'CT-3E4', 
                     #'CT-4E4', 
                     'CT-5E4', 
                     'CT-6E4')


## Removed 1IR, 3IR and 5 IR
Isch = c(#'./data/1IR.CEL', 
         './data/2IR.CEL', 
         #'./data/3IR.CEL', 
         './data/4IR.CEL', 
         #'./data/5IR.CEL', 
         './data/6IR.CEL'
        )
Isch.labels = c(#'I-1IR', 
                'I-2IR', 
                #'I-3IR', 
                'I-4IR', 
                #'I-5IR', 
                'I-6IR'
              )

Isch_Trmt = c('./data/2EIR.CEL', 
              './data/3EIR.CEL', 
              './data/4EIR.CEL', 
              './data/5EIR.CEL', 
              './data/6EIR.CEL'
            )
Isch_Trmt.labels = c('IT-2EIR', 
                     'IT-3EIR', 
                     'IT-4EIR', 
                     'IT-5EIR', 
                     'IT-6EIR'
                    )

## Remove nothing
KO_Ctrl = c('./data/N1.CEL', './data/N2.CEL', './data/N3.CEL')
KO_Ctrl.labels = c('KOC-N1', 'KOC-N2', 'KOC-N3')

KO_Isch = c('./data/N4.CEL', './data/N5.CEL', './data/N6.CEL', './data/N7.CEL')
KO_Isch.labels = c('KOI-N4', 'KOI-N5', 'KOI-N6', 'KOI-N7')

T-test Anomony

I discovered t.test sometimes returns an error and halt operation. This is cause by t.test returning a series of similar intermediate results. See the post to learn more. Here are some code to get around it.

my.t.test.p.value <- function(...) { 
  obj<-try(tzu.t.test(...), silent=TRUE) 
  if (is(obj, "try-error")) return(NA) else return(obj) 
} 


# my.t.test.p.value(Avgdata[100,], agroup = agroup.factor, bgroup = bgroup.factor)
# my.t.test.p.value(Avgdata[24588,], agroup = agroup.factor, bgroup = bgroup.factor, var.equal = F)
# tzu.t.test(Avgdata[24588,], agroup = agroup.factor, bgroup = bgroup.factor, var.equal = F)
# my.t.test.p.value(Avgdata[39680,], agroup = agroup.factor, bgroup = bgroup.factor)

Define Comparison Combinations

Use the combn to set up all the comparison combinaitons

#*########################
#*########################
### Set up to do ALL comparison
#*## 
#*## 

ALL.comparison.CEL = list(
                  'Ctrl' = Ctrl,
                  'Ctrl_Trmt' = Ctrl_Trmt,
                  'Isch' = Isch,
                  'Isch_Trmt' = Isch_Trmt,
                  'KO_Ctrl' = KO_Ctrl,
                  'KO_Isch' = KO_Isch
                  )



ALL.comparisons = list(
                  'Ctrl' = Ctrl.labels,
                  'Ctrl_Trmt' = Ctrl_Trmt.labels,
                  'Isch' = Isch.labels,
                  'Isch_Trmt' = Isch_Trmt.labels,
                  'KO_Ctrl' = KO_Ctrl.labels,
                  'KO_Isch' = KO_Isch.labels
                  )



ALL.combinations = combn(names(ALL.comparisons), 2)

knitr::kable(as.data.frame(ALL.combinations))
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
Ctrl Ctrl Ctrl Ctrl Ctrl Ctrl_Trmt Ctrl_Trmt Ctrl_Trmt Ctrl_Trmt Isch Isch Isch Isch_Trmt Isch_Trmt KO_Ctrl
Ctrl_Trmt Isch Isch_Trmt KO_Ctrl KO_Isch Isch Isch_Trmt KO_Ctrl KO_Isch Isch_Trmt KO_Ctrl KO_Isch KO_Ctrl KO_Isch KO_Isch
ALL.combinations.names = rep(NA, dim(ALL.combinations)[2])
for(i in 1:dim(ALL.combinations)[2]){
  agroup.name = names(ALL.comparisons[ALL.combinations[1,i]])
  bgroup.name = names(ALL.comparisons[ALL.combinations[2,i]])
  comparison.name = paste(agroup.name, 'vs', bgroup.name, sep = '.')
  ALL.combinations.names[i] = comparison.name
}

(ALL.combinations.names)
##  [1] "Ctrl.vs.Ctrl_Trmt"      "Ctrl.vs.Isch"          
##  [3] "Ctrl.vs.Isch_Trmt"      "Ctrl.vs.KO_Ctrl"       
##  [5] "Ctrl.vs.KO_Isch"        "Ctrl_Trmt.vs.Isch"     
##  [7] "Ctrl_Trmt.vs.Isch_Trmt" "Ctrl_Trmt.vs.KO_Ctrl"  
##  [9] "Ctrl_Trmt.vs.KO_Isch"   "Isch.vs.Isch_Trmt"     
## [11] "Isch.vs.KO_Ctrl"        "Isch.vs.KO_Isch"       
## [13] "Isch_Trmt.vs.KO_Ctrl"   "Isch_Trmt.vs.KO_Isch"  
## [15] "KO_Ctrl.vs.KO_Isch"

Looping Analysis

Here is the loop. For demo purposes, I only ran one loop

Some important modification:

#*#############
#*########
## load required libraries

fileLoader<-function(sourcefiles) {
  
  pathdirectory<-'~/Dropbox/TzuRFunctions/'
  sourcefiles<-paste(pathdirectory,sourcefiles,sep="")
  
  for(i in 1:length(sourcefiles))
    source(sourcefiles[i])
  
}  

fileLoader(c('fdr.R', 'medianfilter.R', 'tzu.t.test.R', 'tzu.fold.change.R', 'tzu.t.stat.test.R'))    

library(annaffy)

#for(i in 1:dim(ALL.combinations)[2]){
for(i in 1){
  
  
  import.data = read.celfiles(filenames = unlist(c(ALL.comparison.CEL[ALL.combinations[1,i]], ALL.comparison.CEL[ALL.combinations[2,i]])))
  rma.data = rma(import.data)
  Avgdata = exprs(rma.data)
  
  colnames(Avgdata) = unlist(c(ALL.comparisons[ALL.combinations[1,i]], ALL.comparisons[ALL.combinations[2,i]]))
  

  agroup.index = unlist(ALL.comparisons[ALL.combinations[1,i]])
  agroup.name = names(ALL.comparisons[ALL.combinations[1,i]])
  cat('Do ', agroup.index, '\n')
  cat('vs.\n')
  bgroup.index = unlist(ALL.comparisons[ALL.combinations[2,i]])
  bgroup.name = names(ALL.comparisons[ALL.combinations[2,i]])
  cat('Do ', bgroup.index, '\n')
  comparison.name = paste(agroup.name, 'vs', bgroup.name, sep = '.')
  cat('Comparison name: ', comparison.name, '\n')
  cat('######## \n')
  
  
  agroup = 1:length(unlist(ALL.comparison.CEL[ALL.combinations[1,i]]))
  bgroup = (max(agroup)+1):(max(agroup)+length(unlist(ALL.comparison.CEL[ALL.combinations[2,i]])))
  
  #*#############
  #*########
  ## Perform Median Filter
  
  cat('Performing Median Filter on ', comparison.name, '..................\n')
  p = medianfilter(t(Avgdata), percent = 50)
  H = fdr(1-p, 0.1, 'stepup')
  where = H
  Avgdata = Avgdata[where,]
  
  cat('After remove low variance genes ... \n')
  cat(dim(Avgdata)[1], ' gene left \n')
  
  #*#############
  #*########
  ## Perform Statistical Analysis
  
  cat('Condition 1 is: \n')
  print(colnames(Avgdata[, agroup]))
  cat('Condition 2 is: \n')
  print(colnames(Avgdata[, bgroup]))
  
  cat('\n\n\n Running ', comparison.name, ' now ........................... \n\n')
  
  agroup.mean = apply(Avgdata[,agroup], 1, mean)
  bgroup.mean = apply(Avgdata[,bgroup], 1, mean)
  p<- apply(Avgdata, 1, tzu.t.test, agroup=agroup, bgroup=bgroup, var.equal = F)
  adjP = p.adjust(p, method = 'BH', n = length(p))
  fc<- apply(Avgdata, 1, tzu.fold.change, agroup = agroup, bgroup = bgroup, log.value = T)
  stat<- apply(Avgdata, 1, tzu.t.stat.test, agroup = agroup, bgroup = bgroup)
  
  final.table = c()
  
  anntable = aafTableAnn(rownames(Avgdata), 'mta10sttranscriptcluster.db')
  stat.table = aafTable('t-statistics' = stat, signed = T)
  final.table = merge(anntable, stat.table)
  p.table = aafTable('p-value' = p)
  final.table = merge(final.table, p.table)
  adjP.table = aafTable('Adjusted-p' = adjP)
  final.table = merge(final.table, adjP.table)
  fc.table = aafTable('foldChange' = fc, signed = T)
  final.table = merge(final.table, fc.table)
  agroup.table = aafTable('agroup-mean' = agroup.mean)
  final.table = merge(final.table, agroup.table)
  bgroup.table = aafTable('bgroup-mean' = bgroup.mean)
  final.table = merge(final.table, bgroup.table)
  
  ## Extract Raw data
  
  exprtable = aafTableInt(rma.data, probeids = rownames(Avgdata))
  
  final.table = merge(final.table, exprtable)
  
  saveText(final.table, filename = paste('./output/Almut', comparison.name, 'ALL.DATA.txt', sep = '.'))
  saveHTML(final.table, filename = paste('./output/Almut', comparison.name, 'ALL.DATA.html', sep = '.'))
  
  cat('######################################################\n')
  cat('######################################################\n')
  cat('######################################################\n')
  cat('######################################################\n')
}
## Reading in : ./data/1C.CEL
## Reading in : ./data/2C.CEL
## Reading in : ./data/3C.CEL
## Reading in : ./data/5C.CEL
## Reading in : ./data/1E4.CEL
## Reading in : ./data/2E4.CEL
## Reading in : ./data/5E4.CEL
## Reading in : ./data/6E4.CEL
## Background correcting
## Normalizing
## Calculating Expression
## Do  C-1C C-2C C-3C C-5C 
## vs.
## Do  CT-1E4 CT-2E4 CT-5E4 CT-6E4 
## Comparison name:  Ctrl.vs.Ctrl_Trmt 
## ######## 
## Performing Median Filter on  Ctrl.vs.Ctrl_Trmt ..................
## After remove low variance genes ... 
## 15499  gene left 
## Condition 1 is: 
## [1] "C-1C" "C-2C" "C-3C" "C-5C"
## Condition 2 is: 
## [1] "CT-1E4" "CT-2E4" "CT-5E4" "CT-6E4"
## 
## 
## 
##  Running  Ctrl.vs.Ctrl_Trmt  now ........................... 
## 
## ######################################################
## ######################################################
## ######################################################
## ######################################################