Idiopathic Pulmonary Arterial Hypertension Gene Expression Analysis

Report 1 - Dr. Robert Stearman
By: Tiffany Callahan

Background


Data and Experimental Design


Data

Experimental design

Read in descriptive data file and run summary statistics to examine the distribution of the data prior to importing CEL files

options(width = 500)
## Load EMA and oligo libraries
library(EMA)
library(oligo)

## Read in descriptive data file
PBHI.file.info = read.table(file = "PHBI cel annotation BIO6660 v2.txt", header = T, 
    sep = "\t")
head(PBHI.file.info)
##   PHBI.cel.file PHBI.number Clinical.Category Age  Race    Sex Scan.Date Batch
## 1  PHBI_001.cel    PHBI_001              IPAH  62 White Female   10/1/09     1
## 2  PHBI_002.cel    PHBI_002              IPAH  49 White Female   10/1/09     1
## 3  PHBI_003.cel    PHBI_003              IPAH  64 White   Male   10/1/09     1
## 4  PHBI_019.cel    PHBI_019      Failed Donor  60 White   Male   10/1/09     1
## 5  PHBI_027.cel    PHBI_027      Failed Donor  49 White   Male   10/1/09     1
## 6  PHBI_042.cel    PHBI_042              IPAH  44 White Female   10/1/10     2
## Extract groups: IPAH and Failed Donor
PBHI.type.cl = as.character(PBHI.file.info$Clinical.Category)
PBHI.type.cl
##  [1] "IPAH"         "IPAH"         "IPAH"         "Failed Donor" "Failed Donor" "IPAH"         "IPAH"         "Failed Donor" "Failed Donor" "IPAH"         "Failed Donor" "Failed Donor" "Failed Donor" "Failed Donor" "IPAH"         "IPAH"         "IPAH"
## Extract batch information
PBHI.batch.cl = PBHI.file.info$Batch
PBHI.batch.cl
##  [1] 1 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3 3
## Run basic summary statistics
tapply(PBHI.file.info$Age, PBHI.file.info$Clinical.Category, mean)
## Failed Donor         IPAH 
##        39.88        54.67
tapply(PBHI.file.info$Sex, PBHI.file.info$Clinical.Category, summary)
## $`Failed Donor`
## Female   Male 
##      2      6 
## 
## $IPAH
## Female   Male 
##      5      4
## Visualize age distributions by group
library(ggplot2)

plot = ggplot(PBHI.file.info, aes(factor(PBHI.file.info$Clinical.Category), 
    PBHI.file.info$Age)) + geom_boxplot(aes(fill = factor(PBHI.file.info$Clinical.Category))) + 
    labs(title = "Age Distributions by PAH Group") + xlab("Group") + ylab("Age")
plot + scale_fill_manual("PAH Group", values = c("#56B4E9", "#009E73"))

plot of chunk expDesign

Preprocessing and Quality Control


Import the CEL files

Import CEL files, read in data, and normalize data using the RMA function

## Import CEL files
PBHI.CEL = PBHI.file.info$PBHI.CEL
PBHI.CEL = list.celfiles()

## Read CEL files to directory
PHBI.data = read.celfiles(PBHI.CEL)

## Normalize the data
PHBI.norm = rma(PHBI.data)

Before and After Data Normalization

Create boxplots and density plots of log-intentisy distributions to visulaize data pre and post normalization

## Load color libraries
library(RColorBrewer)

## Set color palette
color.palette = brewer.pal(8, "Set1")

## Pre-normalized intensity values boxplot
boxplot(PHBI.data, col = color.palette, main = "Pre-normalized Intensity Values")

plot of chunk normQC

## Normalized intensity values boxplot
boxplot(PHBI.norm, col = color.palette, main = "Normalized Intensity Values")

plot of chunk normQC

## Pre-normalized density plot of log-intensity distribution
hist(PHBI.data, col = color.palette, main = "Pre-Normalized Density Plot of log-Intensity Distribution")

plot of chunk normQC

## Normalized density plot of log-intensity distribution
hist(PHBI.norm, col = color.palette, main = "Normalized Density Plot of log-Intensity Distribution")

plot of chunk normQC

Filtering Process

Extract normalized expression values and perform filtering to discard probesets below a specified threshold

## Extract expression values after normalization, verify dimensions
PHBI.exprs = exprs(PHBI.norm)
dim(PHBI.exprs)
## [1] 33297    17
## Filter and discard probesets with a maximum log2 expression value below 4,
## p=0.01
PUBHI.f = expFilter(PHBI.exprs, threshold = 4)

plot of chunk clustering

## Keep probes with at least 1 sample(s) with an expression level higher than 4
## View data ditribution after filtering; remove threshold line
PUBHI.A = expFilter(PUBHI.f, threshold = F)

plot of chunk clustering

## Keep probes with at least 1 sample(s) with an expression level higher than FALSE
PUBHI.sample = clustering(data = PUBHI.f, metric = "pearson", method = "ward")

## Visualize data pre and post filtering
clustering.plot(tree = PUBHI.sample, lab = PBHI.type.cl, title = "Filtered Data")

plot of chunk clustering

clustering.plot(tree = PUBHI.sample, lab = PBHI.batch.cl, title = "Filtered PBHI Data by Batch")

plot of chunk clustering

Principal Component Analysis

Run Principle Component Analysis (PCA) on the normalized and filtered data

## Segment variation into different components
acp = runPCA(t(PUBHI.f), scale = FALSE, lab.sample = PBHI.type.cl, plotSample = FALSE, 
    plotInertia = FALSE)
plotInertia(acp)

plot of chunk dataFilter

## View pre-batch correction individual maps (axes 1 and 2) to look at the
## variation between groups and batches
plotSample(acp, axes = c(1, 2), lab = PBHI.type.cl)

plot of chunk dataFilter

plotSample(acp, axes = c(1, 2), lab = as.character(PBHI.batch.cl))

plot of chunk dataFilter

## Create pdf report of PCA with selected plots
acp = runPCA(t(PUBHI.f), scale = FALSE, pdfname = "PCA.pdf", lab.sample = PBHI.type.cl)

Batch Correction

Perform batch correction on the normalized data and re-run PCA to show post-batch correction improvements

# Load sva library
library(sva)

## Create experimental design object and perform batch correction
mod = model.matrix(~as.factor(PBHI.type.cl))
combat_edata = ComBat(dat = PUBHI.f, batch = PBHI.batch.cl, mod = mod, numCovs = NULL, 
    par.prior = TRUE, prior.plots = FALSE)
## Found 3 batches
## Found 1  categorical covariate(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data
## Run PCA after batch correction
acp = runPCA(t(combat_edata), scale = FALSE, lab.sample = PBHI.type.cl, plotSample = FALSE, 
    plotInertia = FALSE)
plotInertia(acp)

plot of chunk batchcorrection

## View post-batch correction individual maps (axes 1 and 2) to look at the
## variation between groups and batches
plotSample(acp, axes = c(1, 2), lab = PBHI.type.cl)

plot of chunk batchcorrection

plotSample(acp, axes = c(1, 2), lab = PBHI.batch.cl)

plot of chunk batchcorrection

## Sample Hierarchical Clustering (post-batch adjustment)
PUBHI.sample2 = clustering(data = combat_edata, metric = "pearson", method = "ward")
clustering.plot(tree = PUBHI.sample2, lab = PBHI.type.cl, title = "PHBI Data - Filtered")

plot of chunk batchcorrection

clustering.plot(tree = PUBHI.sample2, lab = PBHI.batch.cl, title = "PHBI Data By Batch - Filtered")

plot of chunk batchcorrection

Statistical Analysis


Perform statistical analysis

Run Student's t-test and Significance of Analysis of Microarrays (SAM) test on normalized data. Merge the test results into one data set to produce a heatmap

## Set-up Student's t-test comparison factor
PUBHI.type.num = ifelse(PBHI.type.cl == "IPAH", 0, 1)

## Run Student's t-test with batch corrected data
PUBHI.ttest = runTtest(combat_edata, labels = PUBHI.type.num, algo = "t.equalvar", 
    q = 0.2, plot = FALSE)
## [1] "typeFDR= FDR-BH"
head(PUBHI.ttest)
##   probeID     Stat RawpValue AdjpValue
## 1 7892501 -0.89863  0.383047   0.74246
## 2 7892502  3.88407  0.001468   0.08448
## 3 7892503  0.09457  0.925909   0.98006
## 4 7892504 -0.54918  0.590965   0.85741
## 5 7892505  0.12553  0.901772   0.97300
## 6 7892506 -0.74034  0.470525   0.79586
## Run the SAM test with batch corrected data
PUBHI.SAM = runSAM(PUBHI.f, labels = PUBHI.type.num)
## 
## We're doing 24310 complete permutations
## and randomly select 500 of them.
## 
##    Delta p0 False Called     FDR
## 1   0.10  1 820.5   1473 0.55703
## 2   0.15  1 458.5    988 0.46407
## 3   0.20  1 253.0    677 0.37371
## 4   0.25  1 154.0    513 0.30019
## 5   0.30  1  92.5    381 0.24278
## 6   0.35  1  61.0    301 0.20266
## 7   0.40  1  38.0    229 0.16594
## 8   0.45  1  22.0    163 0.13497
## 9   0.50  1  16.0    140 0.11429
## 10  0.55  1   9.0    104 0.08654
## 11  0.60  1   5.0     80 0.06250
## 12  0.65  1   4.0     67 0.05970
## 13  0.70  1   3.0     52 0.05769
## 14  0.75  1   1.0     43 0.02326
## 15  0.80  1   1.0     42 0.02381
## 16  0.85  1   1.0     38 0.02632
## The threshold seems to be at 
##    Delta Called     FDR
## 5 0.6535     67 0.05970
## 6 0.6535     63 0.04762
## [1] "Delta : 0.653531"

plot of chunk Analysis

## [1] "Find 63 significant genes ..."
head(PUBHI.SAM)
##         probeID     Stat RawpValue FoldChange Significant
## 7892501 7892501 -0.25801   0.70422     0.9007       FALSE
## 7892502 7892502  1.41878   0.04811     1.2360       FALSE
## 7892503 7892503  0.05479   0.93553     1.0115       FALSE
## 7892504 7892504 -0.16859   0.80385     0.9657       FALSE
## 7892505 7892505 -0.11046   0.87058     0.9753       FALSE
## 7892506 7892506 -0.62306   0.36446     0.8566       FALSE
## Verify data set dimensions
dim(PUBHI.ttest)
## [1] 28010     4
dim(PUBHI.SAM)
## [1] 28010     5
## Sort genes in both datasets by the 'probeID' variable
PUBHI.ttest.sig = PUBHI.ttest[order(PUBHI.ttest$probeID), ]
PUBHI.SAM.sig = PUBHI.SAM[order(PUBHI.SAM$probeID), ]

## Extract the 'probeID' and 'FoldChange' variables from the PBHI.SAM.sig
## data set
FC.raw = PUBHI.SAM.sig[, c("probeID", "FoldChange")]

## Merge the 'FoldChange' Variable into PUBHI.ttest.sig database by probeID
PUBHI.test.merge = merge(PUBHI.ttest.sig, FC.raw, by = "probeID", all.y = TRUE, 
    all.x = TRUE)
head(PUBHI.test.merge)
##   probeID     Stat RawpValue AdjpValue FoldChange
## 1 7892501 -0.89863  0.383047   0.74246     0.9007
## 2 7892502  3.88407  0.001468   0.08448     1.2360
## 3 7892503  0.09457  0.925909   0.98006     1.0115
## 4 7892504 -0.54918  0.590965   0.85741     0.9657
## 5 7892505  0.12553  0.901772   0.97300     0.9753
## 6 7892506 -0.74034  0.470525   0.79586     0.8566
## Re-sort merged data by the 'AdjpValue' variable
PUBHI.test.merge = PUBHI.test.merge[order(PUBHI.test.merge$AdjpValue), ]
head(PUBHI.test.merge)
##       probeID   Stat RawpValue AdjpValue FoldChange
## 5913  7922174  8.692 3.056e-07  0.006097     1.5718
## 5973  7922793  8.181 6.530e-07  0.006097     1.2189
## 26312 8163908 -8.285 5.578e-07  0.006097     0.5023
## 10129 7973110  7.784 1.204e-06  0.008429     4.4228
## 8924  7958174  7.403 2.208e-06  0.008835     1.7526
## 15130 8031207  7.464 2.000e-06  0.008835     2.4424
## Produce a volcano Plot to verify SAM findings
volcano = ggplot(data = PUBHI.test.merge, aes(x = log2(PUBHI.test.merge$FoldChange), 
    y = -log10(PUBHI.test.merge$AdjpValue)), colour = none) + geom_point(alpha = 0.4, 
    size = 1.75) + labs(title = "PAH Volcano Plot") + xlim(c(-2, 2)) + ylim(c(0, 
    2)) + xlab("Log2 Fold Change") + ylab("-Log10 P-Values")
volcano

plot of chunk Volcano

Output Analysis Result

Produce a heatmap of the top 100 significant genes

## Produce Heatmap
mvgenes = as.character(PUBHI.test.merge$probeID[1:100])
c.sample <- clustering(data = PUBHI.f[mvgenes, ], metric = "pearson", method = "ward")
c.gene <- clustering(data = t(PUBHI.f[mvgenes, ]), metric = "pearson", method = "ward")
clustering.plot(tree = c.sample, tree.sup = c.gene, data = PUBHI.f[mvgenes, 
    ], names.sup = FALSE, lab = PBHI.type.cl, trim.heatmap = 0.99)

plot of chunk heatmap

Annotate Signifigant Genes

Annotate the top 250 significant genes, output to a text file. Input cluster analysis output from DAVID and print the first three rows

options(width = 500)
## Load annaffy and hugene10sttranscriptcluster.db libraries
library(annaffy)
library(hugene10sttranscriptcluster.db)

## Annotate the top 250 significant genes
anntable = aafTableAnn(as.character(PUBHI.test.merge$probeID[1:250]), "hugene10sttranscriptcluster.db")

## Add the 'AdjpValue' and 'FoldChange' variables to the annotation table
atable = aafTable(`P-Value` = PUBHI.test.merge$AdjpValue[1:250], signed = TRUE)
FCtable = aafTable(`Fold Change` = PUBHI.test.merge$FoldChange[1:250], signed = TRUE)
table = merge(anntable, atable)
table2 = merge(table, FCtable)

## Export results to an HTML and text file
saveHTML(table2, file = "PBHI.psig.genes.htm")
saveText(table2, file = "PBHI.psig.genes.txt")

## Print top 7 significant genes at p ≤ 0.01 level
annot.output = read.table(file = "annot.sig.txt", header = T, sep = "\t", nrows = 7)
annot.outputs = annot.output[order(annot.output$P.Value), ]
annot.outputs
##     Probe Symbol                                                                    Description Chromosome  P.Value Fold.Change
## 1 7922793  ARPC5                            actin related protein 2/3 complex, subunit 5, 16kDa          1 0.006097      1.2189
## 2 7922174     F5                             coagulation factor V (proaccelerin, labile factor)          1 0.006097      1.5718
## 3 8163908 GGTA1P                         glycoprotein, alpha-galactosyltransferase 1 pseudogene          9 0.006097      0.5023
## 4 7973110 RNASE2         ribonuclease, RNase A family, 2 (liver, eosinophil-derived neurotoxin)         14 0.008429      4.4228
## 5 8075423 DUSP18                                                dual specificity phosphatase 18         22 0.008835      1.3841
## 6 8031207 LILRA2 leukocyte immunoglobulin-like receptor, subfamily A (with TM domain), member 2         19 0.008835      2.4424
## 7 7958174 TXNRD1                                                        thioredoxin reductase 1         12 0.008835      1.7526
## Read in DAVID cluster analysis output file and print the first 3 rows
DAVID = read.table(file = "DAVID Results_3.01.14.txt", header = T, sep = "\t", 
    nrows = 3)
DAVID
##         Term              Function Count   PValue Fold.Enrichment Bonferroni Benjamini      FDR
## 1 GO:0006952      Defense response    25 1.01e-07           3.571   0.000127  0.000127 0.000164
## 2 GO:0006954 Inflammatory response    17 8.31e-07           4.595   0.001041  0.000521 0.001351
## 3 GO:0009611  Response to wounding    18 1.03e-04           2.983   0.121567  0.042285 0.167994
## Document the R session information
sessionInfo()
## R version 3.0.2 (2013-09-25)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] sva_3.8.0                            RColorBrewer_1.0-5                   pd.hugene.1.0.st.v1_3.8.0            oligo_1.26.6                         oligoClasses_1.24.0                  mgcv_1.7-28                          nlme_3.1-115                         hugene10sttranscriptcluster.db_8.0.1 org.Hs.eg.db_2.10.1                  ggplot2_0.9.3.1                      EMA_1.4.3                            corpcor_1.6.6                        Biostrings_2.30.1                   
## [14] XVector_0.2.0                        IRanges_1.20.7                       annaffy_1.34.0                       KEGG.db_2.10.1                       GO.db_2.10.1                         RSQLite_0.11.4                       DBI_0.2-7                            AnnotationDbi_1.24.0                 Biobase_2.22.0                       BiocGenerics_0.8.0                   knitr_1.5                           
## 
## loaded via a namespace (and not attached):
##  [1] affxparser_1.34.2     affy_1.40.0           affyio_1.30.0         BiocInstaller_1.12.0  biomaRt_2.18.0        bit_1.1-11            cluster_1.15.0        codetools_0.2-8       colorspace_1.2-4      dichromat_2.0-0       digest_0.6.4          evaluate_0.5.1        FactoMineR_1.25       ff_2.2-12             foreach_1.4.1         formatR_0.10          gcrma_2.34.0          GenomicRanges_1.14.4  grid_3.0.2            GSA_1.03              gtable_0.1.2          heatmap.plus_1.3     
## [23] iterators_1.0.6       labeling_0.2          lattice_0.20-27       MASS_7.3-30           Matrix_1.1-2-2        multtest_2.18.0       munsell_0.4.2         plyr_1.8.1            preprocessCore_1.24.0 proto_0.3-10          Rcpp_0.11.0           RCurl_1.95-4.1        reshape2_1.2.2        scales_0.2.3          siggenes_1.36.0       splines_3.0.2         stats4_3.0.2          stringr_0.6.2         survival_2.37-7       tools_3.0.2           XML_3.95-0.2          xtable_1.7-3         
## [45] zlibbioc_1.8.0

Discussion and Conclusion