24 October 2010

Where are the alternative reading frames in the Human Genome ?

The following post was inspired by a question asked recently on Biostar :"Do exons ever have different reading frames in spliced variants?".

To find those alternative reading frames I've used the table KnownGene available at UCSC from : http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/knownGene.txt.gz. This file contains the positions of the exons for each transcript in the human genome:

mysql -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 -e 'select * from knownGene limit 10\G'

(...)
*************************** 7. row ***************************
name: uc009vis.1
chrom: chr1
strand: -
txStart: 4268
txEnd: 6628
cdsStart: 4268
cdsEnd: 4268
exonCount: 4
exonStarts: 4268,4832,5658,6469,
exonEnds: 4692,4901,5805,6628,
proteinID:
alignID: uc009vis.1
*************************** 8. row ***************************
name: uc009vit.1
chrom: chr1
strand: -
txStart: 4268
txEnd: 9622
cdsStart: 4268
cdsEnd: 4268
exonCount: 9
exonStarts: 4268,4832,5658,6469,6720,7095,7777,8130,8775,
exonEnds: 4692,4901,5810,6628,6918,7605,7924,8229,9622,
proteinID:
alignID: uc009vit.1


The following java program creates an array of bytes having a length greater than the length of the human chromosome chr1. This array is initialized with the constant 'NIL'. Then for each chromosome and each transcript, we loop over each exon and we record what was the reading frame (0, 1 or 2) at a given position. If this position was already flagged with another frame, a warning is printed to stdout.
/**
* search for alternative Reading frames in human Genome (hg18)
* Pierre Lindenbaum 2010
* http://biostar.stackexchange.com/questions/3034
*/
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;
public class BioStar3034
{
private static final byte NIL=(byte)99;
private static final byte ALT_FRAME=(byte)80;
/** scans each transcript from knownGene.txt.gz */
private void run() throws Exception
{
//an array of frame 0/1/2 or NIL if not filled
byte genome[]=new byte[247249800];//~ size of human chr1
//download UCSC knownGene
BufferedReader r=new BufferedReader(new InputStreamReader(
new GZIPInputStream( new URL(
"http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/knownGene.txt.gz").openStream())
));
String line;
String prevChrom=null;
while((line=r.readLine())!=null)
{
String tokens[]=line.split("[\t]");
String chrom=tokens[1];
if(prevChrom==null || !(prevChrom.equals(chrom)))
{
Arrays.fill(genome,0,genome.length,NIL);//all the frame
prevChrom=chrom;
}
int cdsStart=Integer.parseInt(tokens[5]);
int cdsEnd=Integer.parseInt(tokens[6]);
int exonsCount=Integer.parseInt(tokens[7]);
String exonStarts[]=tokens[8].split("[,]");
String exonEnds[]=tokens[9].split("[,]");
int starts[]=new int[exonsCount];
int ends[]=new int[exonsCount];
for(int i=0;i< exonsCount;++i)
{
starts[i]=Integer.parseInt(exonStarts[i]);
ends[i]=Integer.parseInt(exonEnds[i]);
}
int cDNA=0;
//orientation is forward
if(tokens[2].equals("+"))
{
int position=cdsStart;
//loop over the cDNA and flag the frames 0/1/2
while(position<cdsEnd)
{
//check position is in an exon
for(int exon=0;exon < exonsCount;++exon)
{
if(!(starts[exon]<=position && position< ends[exon])) continue;
byte frame=(byte)(cDNA%3);
//we have been there before and it was not the same frame !
if( genome[position]!=NIL &&
genome[position]!=frame)
{
genome[position]=ALT_FRAME;
System.out.println(chrom+":"+(position+1)+"-"+(position+2)+" (+)");
}
else
{
genome[position]=frame;
}
cDNA++;
break;
}
++position;
}
}
else //gene on reverse strand'
{
int position=cdsEnd-1;
//loop over the cDNA and flag the frames 0/1/2
while(position>=cdsStart)
{
//check position is in an exon
for(int exon=0;exon < exonsCount;++exon)
{
if(!(starts[exon]<=position && position< ends[exon])) continue;
byte frame=(byte)(10+cDNA%3);//add 10 to flag reverse orientation
//we have been there before and it was not the same frame !
if( genome[position]!=NIL &&
genome[position]!=frame)
{
genome[position]=ALT_FRAME;
System.out.println(chrom+":"+(position+1)+"-"+(position+2)+" (-)");
}
else
{
genome[position]=frame;
}
genome[position]=frame;
cDNA++;
break;
}
--position;
}
}
}
r.close();
}
public static void main(String[] args)
{
try {
BioStar3034 app=new BioStar3034();
app.run();
}
catch (Exception e)
{
e.printStackTrace();
}
}
}

Compilation & Execution


javac BioStar3034.java
java BioStar3034

Result


(...)
chr1:53286155-53286156 (+)
chr1:53286156-53286157 (+)
chr1:53286157-53286158 (+)
chr1:53286158-53286159 (+)
chr1:53286159-53286160 (+)
chr1:53286160-53286161 (+)
chr1:53286161-53286162 (+)
chr1:53286162-53286163 (+)
(...)
java BioStar3034 | sort | uniq | wc -l
300696



(Image from UCSC/OpenWetWare)


That's it
Pierre

No comments: