I have discussed about my speed genome analysis on my previous blog, now am writing the steps how to do that
1. calculate the intergenic distance of ur organism "x" from gtf file using the perl script , I have used the augustus predicted gtf file , the file needs to be modified according to the perl script so that it looks for the particular feature and pattern such as it looks for gene/exon/mRNA feature
scaffold_1904 AUGUSTUS gene 1 775 0.09 + . transcript_id "g18705"; gene_id "temp";
scaffold_1 AUGUSTUS transcript 1 4141 0.05 + . g1.t1
scaffold_1 AUGUSTUS intron 1 180 0.55 + . transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS intron 2022 3511 0.21 + . transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS CDS 181 2021 0.09 + 2 transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS CDS 3512 4141 0.49 + 0 transcript_id "g1.t1"; gene_id "g1";
scaffold_1 AUGUSTUS stop_codon 4139 4141 . + 0 transcript_id "g1.t1"; gene_id "g1";
use the perl script Calculate_FIR_length.pl using the gtf file to calculate the intergenic distance between the features, already complimentbed is there but that is not the one which we need, that is for difference purpose when I posted in a github forum to author I came to know the difference, this is the link https://github.com/Adamtaranto/density-Mapr/issues/1#issuecomment-291475238
2. make a gtf file of Avh's with the location and calculate the intergenic distance for Avh's , intergenic 5 prime and 3 prime distance needs to be calculated the intergenic distance of effector file should look like this
3. then just Run the R script which is pasted below just by changing the names of the file, if the points or plots are going out the cut off please change the bin size and num of bins before its creating the heatmap don't change after the heatmap is made
whole_intergene=read.csv(file="intergene_whole.csv",sep=",")
NumBins=50
if ((max(whole_intergene$fiveprime, na.rm=TRUE)>max(whole_intergene$threeprime, na.rm=TRUE)) == TRUE) { whole_intergene2Bin=whole_intergene$fiveprime} else { whole_intergene2Bin=whole_intergene$threeprime}
whole_intergene2Bin=whole_intergene2Bin[which(whole_intergene2Bin!=0)]
whole_intergene2Bin=na.omit(whole_intergene2Bin)
BinSteps=round(length(whole_intergene2Bin)/ (NumBins-20) , digits=10)
whole_intergene2BinOrd=sort(whole_intergene2Bin)
#### The [2*BinSteps] has been changed to [1*Binsteps it was producing an error after googling the error has been fixed]
TempBinLimits=whole_intergene2BinOrd[seq(whole_intergene2BinOrd[2*BinSteps],length(whole_intergene2BinOrd),BinSteps)]
TempBinLimits[length(TempBinLimits)+1]=max(whole_intergene2Bin, na.rm=TRUE)
x<-seq(length(TempBinLimits))
fit<-nls(log(TempBinLimits) ~ a*x + b, start= c(a=0, b=0),algorithm='port',weights=((x-1.0* NumBins)^2))
pred=predict(fit, x)
BinLimits=c(1, round(exp(pred),0), max(whole_intergene2Bin))
xbin=cut(whole_intergene$fiveprime, breaks=c(BinLimits))
ybin=cut(whole_intergene$threeprime, breaks=c(BinLimits))
whole_intergene=cbind(whole_intergene, xbin, ybin, genevalue=rep(1, length (whole_intergene$fiveprime)))
GenValMatrix<-with(whole_intergene, tapply (genevalue, list(xbin, ybin), sum))
x<-1:ncol(GenValMatrix)
y<-1:nrow(GenValMatrix)
zlim = range(as.numeric (unlist(GenValMatrix)) , finite=TRUE)
mypalette<-colorRampPalette(c( "white","darkblue", "forestgreen", "goldenrod1","orangered", "red3", "darkred"), space="rgb")
mycol=mypalette(2*max(GenValMatrix, na.rm=TRUE))
mylabels<-paste(BinLimits[1:length(BinLimits)-1], BinLimits[2:length(BinLimits)], sep="- ", collapse=NULL)
filled.contour(x, y, z=GenValMatrix,plot.title = title(main ="Phytophthora ramorum Pr102 genome",xlab = "five prime intergenic regions", ylab= "three prime intergenic regions", cex.main=0.8, cex.lab=0.8),key.title = title(main ="Number ofgenes", cex.main=0.5,line=1),col=mycol,levels = pretty(zlim, 1*max(GenValMatrix,na.rm=TRUE)),plot.axes={axis(1,at=x, labels=mylabels, las=2,cex.axis=0.5);axis(2,at=y, labels=mylabels,cex.axis=0.5)})
#wget http://wiki.cbr.washington.edu/qerm/sites/qerm/images/1/16/Filled.contour3.R
source('Filled.contour3.R')
library(png)
library(gridExtra)
library(ggplot2)
image_name<-paste(as.character(format(Sys.time(),"%Y%m%d%H%M%S")), "_graph", sep="")
png(filename = paste(image_name, ".png", sep=""))
par(mar=c(0,0,0,0))
filled.contour(x, y, z=GenValMatrix,col=mycol,levels = pretty(zlim, 2*max(GenValMatrix,na.rm=TRUE)),frame.plot = FALSE,axes = FALSE)
dev.off()
img <- readPNG(paste(image_name, ".png", sep=""))
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
g <- rasterGrob(img, interpolate=TRUE)
rxlrData=as.data.frame(read.csv('rxrl_whole_intergenic.csv',header=TRUE))
ggplot(data=rxlrData,aes(x=rxlrData$fiveprime,y=rxlrData$threeprime,geom="blank"))+annotation_custom(g,xmin=-Inf,xmax=Inf,ymin=-Inf,ymax=Inf)+coord_fixed(ratio=1)+geom_point(shape=21,fill="red",colour="black",size=2,alpha=0.7,na.rm=FALSE)+scale_y_log10(breaks=BinLimits[2:length(BinLimits)],limits=c(BinLimits[2],BinLimits[NumBins +1]))+scale_x_log10(breaks= BinLimits[2:length(BinLimits)],limits= c(BinLimits[2] ,BinLimits[NumBins +1]))+theme(axis.text.y=element_text(size = 10,vjust=0.5))+theme(axis.text.x=element_text(size=10,vjust=0.5,angle=90))+theme(axis.title.x = element_text(face="bold",size=12))+xlab("five prime intergenic region")+theme(axis.title.y = element_text(face="bold",size=12)) + ylab("threeprime intergenic region")
Below I have put the screen shot for the Phytophthora sojae genome from a sample dataset
This code and concept has been acquired from http://biorxiv.org/content/early/2015/07/01/021774