Computational Genomics Lab at IICB: April 2017

I have discussed about my speed genome analysis on my previous blog, now am writing the steps how to do that

1. calculate the intergenic distance of ur organism "x" from gtf file using the perl script , I have used the augustus predicted gtf file , the file needs to be modified according to the perl script so that it looks for the particular feature and pattern such as it looks for gene/exon/mRNA feature

sample of the augustus gtf file
(make sure the last column is mentioned in the gene row)

scaffold_1904 AUGUSTUS gene 1 775 0.09 + . transcript_id "g18705"; gene_id "temp";
scaffold_1 AUGUSTUS transcript 1 4141 0.05 + . g1.t1
scaffold_1 AUGUSTUS intron 1 180 0.55 + . transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS intron 2022 3511 0.21 + . transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS CDS 181 2021 0.09 + 2 transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS CDS 3512 4141 0.49 + 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS stop_codon 4139 4141 . + 0 transcript_id "g1.t1"; gene_id "g1";

use the perl script Calculate_FIR_length.pl using the gtf file to calculate the intergenic distance between the features, already complimentbed is there but that is not the one which we need, that is for difference purpose when I posted in a github forum to author I came to know the difference, this is the link https://github.com/Adamtaranto/density-Mapr/issues/1#issuecomment-291475238

the intergenic distance between the gene file looks like this

"geneid","strand","fiveprime","threeprime"

"g10","+",5020,927

"g84","+",3316,8625

"g42","+",1558,1773

"g156","+",4460,13837

"g93","-",2035,553

"g30","+",117,361

"g106","+",1656,874

"g39","+",1380,720

"g1","+",NA,1222

"g70","+",2614,419

2. make a gtf file of Avh's with the location and calculate the intergenic distance for Avh's , intergenic 5 prime and 3 prime distance needs to be calculated the intergenic distance of effector file should look like this

"geneid","strand","fiveprime","threeprime"

"Avh92","-",4946,80942

"Avh48","+",137224,80942

"Avh102","+",38474,24067

"Avh127","-",137224,6955

"Avh304","-",7882,12043

"Avh313","+",23825,26166

"Avh91","-",24826,4946

"Avh303","+",16698,12043

"Avh34","-",41377,26166

"Avh61","+",4031,5317

"Avh311","-",14467,1021

"Avh310","+",4389,1021

"Avh93","+",41377,38474

3. then just Run the R script which is pasted below just by changing the names of the file, if the points or plots are going out the cut off please change the bin size and num of bins before its creating the heatmap don't change after the heatmap is made

whole_intergene=read.csv(file="intergene_whole.csv",sep=",")

NumBins=50

if ((max(whole_intergene$fiveprime, na.rm=TRUE)>max(whole_intergene$threeprime, na.rm=TRUE)) == TRUE) { whole_intergene2Bin=whole_intergene$fiveprime} else { whole_intergene2Bin=whole_intergene$threeprime}

whole_intergene2Bin=whole_intergene2Bin[which(whole_intergene2Bin!=0)]

whole_intergene2Bin=na.omit(whole_intergene2Bin)

BinSteps=round(length(whole_intergene2Bin)/ (NumBins-20) , digits=10)

whole_intergene2BinOrd=sort(whole_intergene2Bin)

#### The [2*BinSteps] has been changed to [1*Binsteps it was producing an error after googling the error has been fixed]

TempBinLimits=whole_intergene2BinOrd[seq(whole_intergene2BinOrd[2*BinSteps],length(whole_intergene2BinOrd),BinSteps)]

TempBinLimits[length(TempBinLimits)+1]=max(whole_intergene2Bin, na.rm=TRUE)

x<-seq(length(TempBinLimits))

fit<-nls(log(TempBinLimits) ~ a*x + b, start= c(a=0, b=0),algorithm='port',weights=((x-1.0* NumBins)^2))

pred=predict(fit, x)

BinLimits=c(1, round(exp(pred),0), max(whole_intergene2Bin))

xbin=cut(whole_intergene$fiveprime, breaks=c(BinLimits))

ybin=cut(whole_intergene$threeprime, breaks=c(BinLimits))

whole_intergene=cbind(whole_intergene, xbin, ybin, genevalue=rep(1, length (whole_intergene$fiveprime)))

GenValMatrix<-with(whole_intergene, tapply (genevalue, list(xbin, ybin), sum))

x<-1:ncol(GenValMatrix)

y<-1:nrow(GenValMatrix)

zlim = range(as.numeric (unlist(GenValMatrix)) , finite=TRUE)

mypalette<-colorRampPalette(c( "white","darkblue", "forestgreen", "goldenrod1","orangered", "red3", "darkred"), space="rgb")

mycol=mypalette(2*max(GenValMatrix, na.rm=TRUE))

mylabels<-paste(BinLimits[1:length(BinLimits)-1], BinLimits[2:length(BinLimits)], sep="- ", collapse=NULL)

filled.contour(x, y, z=GenValMatrix,plot.title = title(main ="Phytophthora ramorum Pr102 genome",xlab = "five prime intergenic regions", ylab= "three prime intergenic regions", cex.main=0.8, cex.lab=0.8),key.title = title(main ="Number ofgenes", cex.main=0.5,line=1),col=mycol,levels = pretty(zlim, 1*max(GenValMatrix,na.rm=TRUE)),plot.axes={axis(1,at=x, labels=mylabels, las=2,cex.axis=0.5);axis(2,at=y, labels=mylabels,cex.axis=0.5)})

#wget http://wiki.cbr.washington.edu/qerm/sites/qerm/images/1/16/Filled.contour3.R

source('Filled.contour3.R')

library(png)

library(gridExtra)

library(ggplot2)

image_name<-paste(as.character(format(Sys.time(),"%Y%m%d%H%M%S")), "_graph", sep="")

png(filename = paste(image_name, ".png", sep=""))

par(mar=c(0,0,0,0))

filled.contour(x, y, z=GenValMatrix,col=mycol,levels = pretty(zlim, 2*max(GenValMatrix,na.rm=TRUE)),frame.plot = FALSE,axes = FALSE)

dev.off()

img <- readPNG(paste(image_name, ".png", sep=""))

library(gridExtra)

library(grid)

library(ggplot2)

library(lattice)

g <- rasterGrob(img, interpolate=TRUE)

rxlrData=as.data.frame(read.csv('rxrl_whole_intergenic.csv',header=TRUE))

ggplot(data=rxlrData,aes(x=rxlrData$fiveprime,y=rxlrData$threeprime,geom="blank"))+annotation_custom(g,xmin=-Inf,xmax=Inf,ymin=-Inf,ymax=Inf)+coord_fixed(ratio=1)+geom_point(shape=21,fill="red",colour="black",size=2,alpha=0.7,na.rm=FALSE)+scale_y_log10(breaks=BinLimits[2:length(BinLimits)],limits=c(BinLimits[2],BinLimits[NumBins +1]))+scale_x_log10(breaks= BinLimits[2:length(BinLimits)],limits= c(BinLimits[2] ,BinLimits[NumBins +1]))+theme(axis.text.y=element_text(size = 10,vjust=0.5))+theme(axis.text.x=element_text(size=10,vjust=0.5,angle=90))+theme(axis.title.x = element_text(face="bold",size=12))+xlab("five prime intergenic region")+theme(axis.title.y = element_text(face="bold",size=12)) + ylab("threeprime intergenic region")

Below I have put the screen shot for the Phytophthora sojae genome from a sample dataset

minor edits can be done to make it good !!

This code and concept has been acquired from http://biorxiv.org/content/early/2015/07/01/021774

Computational Genomics Lab at IICB

Followers

Tuesday, April 11, 2017

two-speed genome analysis using R and perl

About the group