My R Frequently Used Scripts

Importing and Exporting NGS elements

read.fastq( )

Subsetting column based on factors from other column

data[which(data$other_column == "myChoice"), "myColumn"]

Storing Annotation in TranscriptDb Object

txdb = makeTxDbFromUCSC(
  genome="hg19",
  tablename="knownGene")

eByg = exonsBy(txdb, by = 'gene')

For Looping a series of values

seq(along=my.var)

Annotating IDs

Bioconductor’s annotation packages help with mapping various ID schemes to each other. We load the AnnotationDbi package and the annotation package org.Hs.eg.db:

library("AnnotationDbi") 
library("org.Hs.eg.db")

This is the organism annotation package (“org”) for Homo sapiens (“Hs”), organized as an AnnotationDbi database package (“db”), using Entrez Gene IDs (“eg”) as primary key. To get a list of all available key types, use:

columns(org.Hs.eg.db) 

## [1] "ENTREZID" "PFAM" "IPI" "PROSITE" "ACCNUM" "ALIAS"
## [7] "CHR" "CHRLOC" "CHRLOCEND" "ENZYME" "MAP" "PATH"
## [13] "PMID" "REFSEQ" "SYMBOL" "UNIGENE" "ENSEMBL" "ENSEMBLPROT"
## [19] "ENSEMBLTRANS" "GENENAME" "UNIPROT" "GO" "EVIDENCE" "ONTOLOGY"
## [25] "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM" "UCSCKG"

We can use the mapIds function to add individual columns to our results table. We provide the row names of our results table as a key, and specify that keytype=ENSEMBL. The column argument tells the mapIds function which infor- mation we want, and the multiVals argument tells the function what to do if there are multiple possible values for a single input value. Here we ask to just give us back the first one that occurs in the database. To add the gene symbol and Entrez ID, we call mapIds twice.

res$symbol <- mapIds(org.Hs.eg.db, 
        \t\tkeys=row.names(res),
        column="SYMBOL", 
        keytype="ENSEMBL", 
        multiVals="first")

res$entrez <- mapIds(org.Hs.eg.db, 
        keys=row.names(res),
        column="ENTREZID", 
        keytype="ENSEMBL", 
        multiVals="first"

Simple RNAseq Quantitation

import libraries

library(GenomicFeatures)

Store Annotation in TranscriptDb


txdb = makeTxDbFromUCSC(
  genome = 'hg19',
  tablename = 'knownGene'
  
)

eByg = exonsBy(txdb, by = 'gene')


Sample organization


## Patient Data
##

patient.old = './mid_output/n4_BAMs/aligned_125bp/FF8_KCT_RNA_GCCAAT_L008Aligned.sortedByCoord.out.bam'
patient.new = './mid_output/n4_BAMs/aligned_150bp/F5FMX-KCT_S5_L008Aligned.sortedByCoord.out.bam'

## Gene Corrected
##

corrected.old = './mid_output/n4_BAMs/aligned_125bp/FF8_605_KCT_CTTGTA_L008Aligned.sortedByCoord.out.bam'
corrected.new = './mid_output/n4_BAMs/aligned_150bp/F5C7-21-KCT_S6_L008Aligned.sortedByCoord.out.bam'

RNAseq.samples = c(
  patient.old,
  patient.new,
  corrected.old,
  corrected.new
)

Quantitation

library(BiocParallel)

register(SerialParam())
register(MulticoreParam(workers = 10))

library(GenomicAlignments)

filenames = RNAseq.samples
bamfiles = BamFileList(filenames, yieldSize = 2000000)

if(file.exists('se.Rdata')){
  ## Load the file
  load('se.Rdata')
}else{
  ## Wow, this takes a long long long time ... 
  ptm = proc.time()
  se = summarizeOverlaps(features = eByg, reads = bamfiles,
                         mode='Union',
                         singleEnd=F,
                         ignore.strand = T,
                         fragments = T)
  time.ellapsed = proc.time() - ptm
  cat('Ellapsed time is ' , time.ellapsed[3]/60, ' minutes\n\n')
  
  ## Save a copy on disk
  save(se, file = 'se.Rdata')
}

tmp.count = assay(se)

sample.names = c('FF8_KCT','F5FMX-KCT','FF8_605','F5C7-21')

colnames(tmp.count) = sample.names


Log transformation


library(DESeq2)


log.count = rlog(tmp.count)
rownames(log.count) = rownames(tmp.count)


Generating GSEA format

Import file


rlog.data = read.csv(file = 'Senthilnath.logCount.csv')

all.data = rlog.data[, 2:10]
rownames(all.data) = rlog.data$X

all.entrez = as.character(rlog.data$X)

Convert to Gene Symbol and Description


library(AnnotationDbi)
library(org.Hs.eg.db)

all.symbol = mapIds(org.Hs.eg.db, keys = all.entrez, column = 'SYMBOL', keytype = 'ENTREZID', multiVals = 'first')


all.desc = mapIds(org.Hs.eg.db, keys = all.entrez, column = 'GENENAME', keytype = 'ENTREZID', multiVals = 'first')

Now, generate the GSEA files

Generate the Expression file


user = 'Senthilnath'

#######################
##############
#####
## Need to be changed
####
##############
#######################
var1 = '123'
var2 = '456'
var.subset = c(1,2,3,4,5,6)
#######################
var1 = '123'
var2 = '789'
var.subset = c(1,2,3,7,8,9)
#######################
var1 = '456'
var2 = '789'
var.subset = c(4,5,6,7,8,9)
#######################


tmp.data = all.data[ ,var.subset]
all.zero = rowSums(tmp.data)

tmp.data = as.matrix(cbind('Symbols' = as.character(all.symbol), 'Description' = as.character(all.desc), tmp.data))

## Need to remove all zeros
tmp.data = tmp.data[all.zero != 0, ]

version.string = '#1.2'
num.row = dim(tmp.data)[1]
num.col = dim(tmp.data)[2]-2

gsea.filename = paste(user, var1, 'vs', var2, 'GSEA.expression.gct', sep = '.')


## Let's get started ...
cat('', file = gsea.filename)
cat('#2.0', file = gsea.filename, append = T)  ## Line 1
cat('\n', file = gsea.filename, sep = '\t', append = T)
cat(c(num.row, num.col), file = gsea.filename, sep = '\t', append = T) ## Line 2
cat('\n', file = gsea.filename, sep = '\t', append = T)
cat(colnames(tmp.data), file = gsea.filename, sep = '\t', append = T)  ## Header Line
cat('\n', file = gsea.filename, sep = '\t', append = T)
## The rest of the data lines
for(i in 1:dim(tmp.data)[1]){
  
  cat(tmp.data[i,], file = gsea.filename, sep = '\t', append = T)
  cat('\n', file = gsea.filename, sep = '\t', append = T)
  
}


Generate the phenotype file



num.sample = num.col
num.comparison = 2

gsea.cls.filename = paste(user, var1, 'vs', var2, 'pheno.cls', sep = '.')

cat('', file = gsea.cls.filename)
cat(c(num.sample, num.comparison, '1'), file = gsea.cls.filename, sep = '\t', append = T) ## Line 1
cat('\n', file = gsea.cls.filename, sep = '\t', append = T)
cat(c('#', var1, var2), file = gsea.cls.filename, sep = '\t', append = T)  ## Line 2
cat('\n', file = gsea.cls.filename, sep = '\t', append = T)
cat(c(rep(var1, 3), rep(var2, 3)), file = gsea.cls.filename, sep = '\t', append = T)  ## Line 3
cat('\n', file = gsea.cls.filename, sep = '\t', append = T)