19 September 2010

Indexing some genomic positions with MongoDB: my benchmark

The aim of this post was to find a good way to index some genomic positions with mongodb. (Update: it was tested on my laptop without replication )

My initial dataset is a list of SNPs on the chromosomes chr22 and chrM from the UCSC.

mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
-e 'select chrom,chromStart,name from snp130 where chrom in("chr22","chrM")'


From this dataset, a javascript input for mongodb was generated using the following AWK scripts:
awk -f data2mongo.awk dataset.xls > mongo.js
Each script runs a loop searching some SNPs in a random range on the chr22.
mongo bio mongo.js


The winner: Test 1

Indexing both fields:db.things.ensureIndex({chrom:1,position:1}):
BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({chrom:1,position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result: seconds:0.228

Test 2

Indexing each field : db.things.ensureIndex({chrom:1}); and db.things.ensureIndex({position:1});.
BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({chrom:1});\n");
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result: seconds:0.25



Test 3


No index.


BEGIN {
printf("db.snps.drop();\n");
}

{
printf("db.snps.save({chrom:\"%s\",position:%s,name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({chrom:\"chr22\",position:{$gt:pos1,$lt:pos2}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result:seconds:277.751



Test 4

String padding:the chromosome and the position are concatenated in a fixed-length string


BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}
Result: seconds:169.028

Test 5

string padding + index


BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({position:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("db.snps.ensureIndex({position:1});\n");
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({position:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

Result: seconds:0.292



Test 6

_id as a padded string as _id

.
BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find({_id:{$gt:pad(\"22\",pos1),$lt:pad(\"22\",pos2)}});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result : seconds:1.252



Test 7

Using a padded string for _id and min()/max() for searching
BEGIN {
printf("db.snps.drop();\n");
printf("function pad2(s,L) { while(s.length<L) { s=\"0\"+s;} return s;}\n");
printf("function pad(chrom,position) { return pad2(chrom,2)+\":\"+pad2(\"\"+position,10);}\n");
}

{
gsub(/chr/,"",$1);
printf("db.snps.save({_id:pad(\"%s\",%s),name:\"%s\"});\n",$1,$2,$3);
}

END {
printf("var tStart=new Date().getTime();\n");
printf("for(i=0;i< 1000;++i)\n{\n");
printf("var pos1 =Math.floor(Math.random()*50000000);\n");
printf("var pos2 =pos1 + Math.floor(Math.random()*10000);\n");
printf("var c=db.snps.find().min({_id:pad(\"22\",pos1)}).max({_id:pad(\"22\",pos2)});\n");
printf("while(c.hasNext()) c.next();\n");
printf("}\n");
printf("print(\"seconds:\"+(new Date().getTime()-tStart)/1000)");
}

result Seconds:3.189



Test 8

I also used a composite _id: db.snps.save({_id:{chrom:"chr22",position:14430966},name:"rs2844899"}); but i was not able to query this table with $gt/$lt:


> db.snps.find({_id:{chrom:"chr22",position:{$gt:14430966}}}).count()
0
> db.snps.find({_id:{chrom:"chr22",position:14430966}}).count()
1


Any other idea for indexing those data ? Feel free to leave a message here or on biostar or stackoverflow.


That's it

Pierre

No comments: