Binning the genome.
Some tables in the UCSC genome mysql database contains a column named Bin.
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg18
mysql> select bin,name,chrom,chromStart,chromEnd from snp130 where chromStart>10000 and chromEnd<20000 limit 10;
+-----+------------+-------+------------+----------+
| bin | name | chrom | chromStart | chromEnd |
+-----+------------+-------+------------+----------+
| 585 | rs12354060 | chr1 | 10003 | 10004 |
| 585 | rs56336884 | chr1 | 10003 | 10004 |
| 585 | rs3950650 | chr1 | 10020 | 10021 |
| 585 | rs3960288 | chr1 | 10020 | 10021 |
| 585 | rs3971686 | chr1 | 10028 | 10029 |
| 585 | rs4030269 | chr1 | 10028 | 10029 |
| 585 | rs3950649 | chr1 | 10053 | 10054 |
| 585 | rs3960289 | chr1 | 10053 | 10054 |
| 585 | rs3962424 | chr1 | 10053 | 10054 |
| 585 | rs12239663 | chr1 | 10068 | 10069 |
+-----+------------+-------+------------+----------+
10 rows in set (0.21 sec)
I wondered what is that column (it is not a primary key) and after asking for more information on http://biostar.stackexchange.com, I eventually found that this column was an index described by Jim Kent in "The Human Genome Browser at UCSC", doi: 10.1101/gr.229102 . Genome Res. 2002. 12:996-1006 and on the UCSC Wiki. Citing:"Binning scheme for optimizing database accesses for genomic annotations that cover a particular region of the genome( ...) Features are put in the smallest bin in which they fit. (...) Typically, almost all features are in the smaller bins, and in the most common usage scenarios only the contents of a few of these smaller bins need to be examined. (...) Since all of these bins are in sizes of powers of two, the calculation of the bin number is a simple matter of bit shifting of the chromStart and chromEnd coordinates".mysql> select bin,name,chrom,chromStart,chromEnd from snp130 where chromStart>10000 and chromEnd<20000 limit 10;
+-----+------------+-------+------------+----------+
| bin | name | chrom | chromStart | chromEnd |
+-----+------------+-------+------------+----------+
| 585 | rs12354060 | chr1 | 10003 | 10004 |
| 585 | rs56336884 | chr1 | 10003 | 10004 |
| 585 | rs3950650 | chr1 | 10020 | 10021 |
| 585 | rs3960288 | chr1 | 10020 | 10021 |
| 585 | rs3971686 | chr1 | 10028 | 10029 |
| 585 | rs4030269 | chr1 | 10028 | 10029 |
| 585 | rs3950649 | chr1 | 10053 | 10054 |
| 585 | rs3960289 | chr1 | 10053 | 10054 |
| 585 | rs3962424 | chr1 | 10053 | 10054 |
| 585 | rs12239663 | chr1 | 10068 | 10069 |
+-----+------------+-------+------------+----------+
10 rows in set (0.21 sec)
Kent & al. The Human Genome Browser at UCSC", doi: 10.1101/gr.229102 . Genome Res. 2002. 12:996-1006
Jim Kent's C implementation is small and elegant (it uses some clever bit shifts operators (see source code). I've tried to implement my own java version (less elegant (it uses a recursive function and I'm not Jim Kent) but more readeable (at least for me)).
/** Given start,end in chromosome coordinates assign it
* a bin. A range goes into the smallest bin it will fit in. */
public int getBin(int chromStart,int chromEnd)
{
int genomicLength=getMaxGenomicLengthLevel();
return calcBin(chromStart,chromEnd,0,0,0,0,1,0,genomicLength);
}
/** length for level 0 */
protected int getMaxGenomicLengthLevel() { return 536870912;/* 2^29 */}
/** maximum level in Jim Kent's algorithm */
protected int getMaxLevel() { return 4;}
/** how many children for one node ? */
protected int getChildrenCount() { return 8;}
private int calcBin(
final int chromStart,
final int chromEnd,
int binId,
int level,
int binRowStart,
int rowIndex,
int binRowCount,
int genomicPos,
int genomicLength
)
{
if(
chromStart>=genomicPos &&
chromEnd<= (genomicPos+genomicLength))
{
if(level>=getMaxLevel()) return binId;
int childLength=genomicLength/getChildrenCount();
int childBinRowCount=binRowCount*getChildrenCount();
int childRowBinStart=binRowStart+binRowCount;
int firstChildIndex=rowIndex*getChildrenCount();
int firstChildBin=childRowBinStart+firstChildIndex;
for(int i=0;i< getChildrenCount();++i)
{
int n= calcBin(
chromStart,
chromEnd,
firstChildBin+i,
level+1,
childRowBinStart,
firstChildIndex+i,
childBinRowCount,
genomicPos+i*childLength,
childLength
);
if(n!=-1)
{
return n;
}
}
return binId;
}
return -1;
}
The same kind of function can also be used to find all the bins that should be scanned for finding the objects lying under a given genomic region.* a bin. A range goes into the smallest bin it will fit in. */
public int getBin(int chromStart,int chromEnd)
{
int genomicLength=getMaxGenomicLengthLevel();
return calcBin(chromStart,chromEnd,0,0,0,0,1,0,genomicLength);
}
/** length for level 0 */
protected int getMaxGenomicLengthLevel() { return 536870912;/* 2^29 */}
/** maximum level in Jim Kent's algorithm */
protected int getMaxLevel() { return 4;}
/** how many children for one node ? */
protected int getChildrenCount() { return 8;}
private int calcBin(
final int chromStart,
final int chromEnd,
int binId,
int level,
int binRowStart,
int rowIndex,
int binRowCount,
int genomicPos,
int genomicLength
)
{
if(
chromStart>=genomicPos &&
chromEnd<= (genomicPos+genomicLength))
{
if(level>=getMaxLevel()) return binId;
int childLength=genomicLength/getChildrenCount();
int childBinRowCount=binRowCount*getChildrenCount();
int childRowBinStart=binRowStart+binRowCount;
int firstChildIndex=rowIndex*getChildrenCount();
int firstChildBin=childRowBinStart+firstChildIndex;
for(int i=0;i< getChildrenCount();++i)
{
int n= calcBin(
chromStart,
chromEnd,
firstChildBin+i,
level+1,
childRowBinStart,
firstChildIndex+i,
childBinRowCount,
genomicPos+i*childLength,
childLength
);
if(n!=-1)
{
return n;
}
}
return binId;
}
return -1;
}
Testing this 'bin' thing
Connecting to the UCSC:
mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg18
Count the number of SNPs on chr1 between 10000 and 20000.
select count(*) from snp130 where chrom="chr1" and chromStart>10000 and chromEnd<20000 ;
+----------+
| count(*) |
+----------+
| 487 |
+----------+
1 row in set (7.35 sec)
+----------+
| count(*) |
+----------+
| 487 |
+----------+
1 row in set (7.35 sec)
Using my piece of java code, I've found that the bins covering the region chr1:10000-20000 are:
- 0
- 1
- 585
- 9
- 73
OK, now let's redo the first query using the bin.
select count(*) from snp130 where chrom="chr1" and chromStart>10000 and chromEnd<20000 and bin in (0,1,585,9,73);
+----------+
| count(*) |
+----------+
| 487 |
+----------+
1 row in set (0.21 sec)
"0.21 sec" vs "7.35 sec" : conclusion: using the bin increases the speed of 35x.+----------+
| count(*) |
+----------+
| 487 |
+----------+
1 row in set (0.21 sec)
That's it
Pierre
2 comments:
Interesting. what is the performance difference if you execute the queries in different order? The first query caches the relevant data blocks into cache and the second query benefits from that.
When comparing two queries, wall clock time is not the best indicator because it depends on other programs, cache state etc.
I'm guessing here, but you could get even better performance if you had concatenated index on chromStart and chromEnd. No bins needed.
yes, I don't think that your 2nd index would be more useful for finding some objects within a given interval.
Post a Comment