Indexing some genomic positions with MongoDB: my benchmark
The aim of this post is to find a good way to index genomic positions with MongoDB. (Update: the tests were run on my laptop, without replication.)
My initial dataset is a list of SNPs on chromosomes chr22 and chrM, downloaded from the UCSC public MySQL server.
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
-e 'select chrom,chromStart,name from snp130 where chrom in("chr22","chrM")'
-e 'select chrom,chromStart,name from snp130 where chrom in("chr22","chrM")'
From this dataset, a javascript input for mongodb was generated using the following AWK scripts:
awk -f data2mongo.awk dataset.xls > mongo.js
Each script runs a loop searching for SNPs in a random range on chr22.
mongo bio mongo.js
The winner: Test 1
Indexing both fields with a single compound index: db.things.ensureIndex({chrom:1,position:1})
# Test 1: compound index on (chrom, position).
# Reads one SNP per input line -- chrom in $1, position in $2, name in $3 --
# and writes a mongo shell script that reloads db.snps, builds the index and
# times 1000 random range queries on chr22.
BEGIN {
    # Drop any existing collection before loading.
    printf("db.snps.drop();\n");
}
{
    # One insert per SNP; the position is written unquoted, i.e. as a
    # JavaScript number literal, while chrom and name are quoted strings.
    printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}
END {
    # The index is created after all inserts and before the timer starts.
    printf("db.snps.ensureIndex({chrom:1,position:1});\n");
    printf("var tStart=new Date().getTime();\n");
    # 1000 queries, each over a random window:
    # start below 50,000,000 and width below 10,000.
    printf("for(i=0;i< 1000;++i)\n{\n");
    printf("var pos1 =Math.floor(Math.random()*50000000);\n");
    printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
    printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
    # Walk the whole cursor so every matching document is fetched.
    printf("while(c.hasNext()) c.next();\n");
    printf("}\n");
    # Report the elapsed time in seconds.
    printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
printf("db.snps.drop();\n");
}
{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("db.snps.ensureIndex({chrom:1,position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
result: seconds:0.228
Test 2
Indexing each field separately: db.things.ensureIndex({chrom:1});
and db.things.ensureIndex({position:1});
# Test 2: two separate single-field indexes, one on chrom and one on position.
# Reads one SNP per input line -- chrom in $1, position in $2, name in $3 --
# and writes a mongo shell script that reloads db.snps, builds both indexes
# and times 1000 random range queries on chr22.
BEGIN {
    # Drop any existing collection before loading.
    printf("db.snps.drop();\n");
}
{
    # One insert per SNP; the position is written unquoted, i.e. as a
    # JavaScript number literal, while chrom and name are quoted strings.
    printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}
END {
    # Both indexes are created after all inserts and before the timer starts.
    printf("db.snps.ensureIndex({chrom:1});\n");
    printf("db.snps.ensureIndex({position:1});\n");
    printf("var tStart=new Date().getTime();\n");
    # 1000 queries, each over a random window:
    # start below 50,000,000 and width below 10,000.
    printf("for(i=0;i< 1000;++i)\n{\n");
    printf("var pos1 =Math.floor(Math.random()*50000000);\n");
    printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
    printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
    # Walk the whole cursor so every matching document is fetched.
    printf("while(c.hasNext()) c.next();\n");
    printf("}\n");
    # Report the elapsed time in seconds.
    printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
printf("db.snps.drop();\n");
}
{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("db.snps.ensureIndex({chrom:1});\n");
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result: seconds:0.25
Test 3
No index.
# Test 3: baseline without any index.
# Each input line supplies chrom ($1), position ($2) and name ($3); the
# output is a mongo shell script that reloads db.snps and times 1000 random
# range queries on chr22.

# Write one line of the generated script, followed by a newline.
function emit(line) {
    printf "%s\n", line
}

BEGIN {
    emit("db.snps.drop();")
}

{
    chrom = $1
    pos = $2
    rsid = $3
    # The position stays unquoted so it is written as a number literal.
    printf "db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n", chrom, pos, rsid
}

END {
    emit("var tStart=new Date().getTime();")
    # Loop header and opening brace go on two separate output lines.
    emit("for(i=0;i< 1000;++i)")
    emit("{")
    emit("var pos1 =Math.floor(Math.random()*50000000);")
    emit("var pos2 =pos1 + Math.floor(Math.random()*10000);")
    emit("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});")
    emit("while(c.hasNext()) c.next();")
    emit("}")
    # The final statement is written without a trailing newline.
    printf "%s", "print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"
}
printf("db.snps.drop();\n");
}
{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result:seconds:277.751
Test 4
String padding: the chromosome and the position are concatenated in a fixed-length string.
# Test 4: chromosome and position merged into one zero-padded string field
# named "position", queried without an index.
# Each input line supplies chrom ($1), position ($2) and name ($3).

# Write one line of the generated script, followed by a newline.
function emit(line) {
    printf "%s\n", line
}

BEGIN {
    emit("db.snps.drop();")
    # JavaScript helpers placed at the top of the generated script:
    # pad2 left-pads with "0" up to length L; pad joins the chromosome
    # (width 2) and the position (width 10) with ":".
    emit("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}")
    emit("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}")
}

{
    # Remove every "chr" from the chromosome name before it is padded.
    chrom = $1
    gsub(/chr/, "", chrom)
    printf "db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n", chrom, $2, $3
}

END {
    emit("var tStart=new Date().getTime();")
    # Loop header and opening brace go on two separate output lines.
    emit("for(i=0;i< 1000;++i)")
    emit("{")
    emit("var pos1 =Math.floor(Math.random()*50000000);")
    emit("var pos2 =pos1 + Math.floor(Math.random()*10000);")
    emit("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});")
    emit("while(c.hasNext()) c.next();")
    emit("}")
    # The final statement is written without a trailing newline.
    printf "%s", "print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"
}
Result: seconds:169.028
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Test 5
string padding + index
# Test 5: chromosome and position merged into one zero-padded string field
# named "position", with an index on that field.
# Each input line supplies chrom ($1), position ($2) and name ($3).

# Write one line of the generated script, followed by a newline.
function emit(line) {
    printf "%s\n", line
}

BEGIN {
    emit("db.snps.drop();")
    # JavaScript helpers placed at the top of the generated script:
    # pad2 left-pads with "0" up to length L; pad joins the chromosome
    # (width 2) and the position (width 10) with ":".
    emit("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}")
    emit("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}")
}

{
    # Remove every "chr" from the chromosome name before it is padded.
    chrom = $1
    gsub(/chr/, "", chrom)
    printf "db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n", chrom, $2, $3
}

END {
    # The index is created after all inserts and before the timer starts.
    emit("db.snps.ensureIndex({position:1});")
    emit("var tStart=new Date().getTime();")
    # Loop header and opening brace go on two separate output lines.
    emit("for(i=0;i< 1000;++i)")
    emit("{")
    emit("var pos1 =Math.floor(Math.random()*50000000);")
    emit("var pos2 =pos1 + Math.floor(Math.random()*10000);")
    emit("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});")
    emit("while(c.hasNext()) c.next();")
    emit("}")
    # The final statement is written without a trailing newline.
    printf "%s", "print(\"seconds:\"+(new Date().getTime()-tStart)/1000)"
}
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result: seconds:0.292
Test 6
Using the padded string as the _id.
# Test 6: the zero-padded "chromosome:position" string is stored as _id.
# Reads one SNP per input line -- chrom in $1, position in $2, name in $3 --
# and writes a mongo shell script that reloads db.snps and times 1000 random
# range queries on _id for chromosome 22. No ensureIndex call is emitted.
BEGIN {
    # Drop any existing collection before loading.
    printf("db.snps.drop();\n");
    # JavaScript helpers placed at the top of the generated script:
    # pad2 left-pads with "0" up to length L; pad joins the chromosome
    # (width 2) and the position (width 10) with ":".
    printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
    printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
    # Remove every "chr" from the chromosome name before it is padded.
    gsub(/chr/,"",$1);
    printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
    printf("var tStart=new Date().getTime();\n");
    # 1000 queries, each over a random window:
    # start below 50,000,000 and width below 10,000.
    printf("for(i=0;i< 1000;++i)\n{\n");
    printf("var pos1 =Math.floor(Math.random()*50000000);\n");
    printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
    # Range query on _id, with both bounds built by the same pad() helper.
    printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
    # Walk the whole cursor so every matching document is fetched.
    printf("while(c.hasNext()) c.next();\n");
    printf("}\n");
    # Report the elapsed time in seconds.
    printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
result : seconds:1.252
Test 7
Using a padded string for _id and min()/max() for searching
# Test 7: the zero-padded "chromosome:position" string is stored as _id and
# the range is selected with cursor min()/max() instead of $gt/$lt.
# Reads one SNP per input line -- chrom in $1, position in $2, name in $3 --
# and writes a mongo shell script that reloads db.snps and times 1000 random
# range queries for chromosome 22.
BEGIN {
    # Drop any existing collection before loading.
    printf("db.snps.drop();\n");
    # JavaScript helpers placed at the top of the generated script:
    # pad2 left-pads with "0" up to length L; pad joins the chromosome
    # (width 2) and the position (width 10) with ":".
    printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
    printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
    # Remove every "chr" from the chromosome name before it is padded.
    gsub(/chr/,"",$1);
    printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
    printf("var tStart=new Date().getTime();\n");
    # 1000 queries, each over a random window:
    # start below 50,000,000 and width below 10,000.
    printf("for(i=0;i< 1000;++i)\n{\n");
    printf("var pos1 =Math.floor(Math.random()*50000000);\n");
    printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
    # Bounds are passed to min()/max() on an unfiltered find().
    printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n");
    # Walk the whole cursor so every matching document is fetched.
    printf("while(c.hasNext()) c.next();\n");
    printf("}\n");
    # Report the elapsed time in seconds.
    printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}
{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}
END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result: seconds:3.189
Test 8
I also tried a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"});
but I was not able to query this collection with $gt/$lt:
> db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count()
0
> db.snps.find({_id:{chrom:"chr22",position:14430966}}).count()
1
0
> db.snps.find({_id:{chrom:"chr22",position:14430966}}).count()
1
Any other ideas for indexing this data? Feel free to leave a message here, on Biostar or on Stack Overflow.
That's it
Pierre
No comments:
Post a Comment