Storing some SNPs using the leveldb library. My notebook.
In this post I'll describe how I've used the leveldb API, a C++ key-value database, to store some SNPs (key=rs,value=sequence). "LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.". A benchmark for this engine is available here: http://code.google.com/p/leveldb/source/browse/trunk/doc/benchmark.html.
Download & install
$ svn checkout http://leveldb.googlecode.com/svn/trunk/ leveldb-read-only
$ cd leveldb-read-only/
$ make
Open & close a leveldb database
#include "leveldb/db.h"
(...)
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;/* custom comparator for ordering the keys, see later */
options.create_if_missing = true;
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
(...)
/* use the database */
(...)
delete db;
A custom comparator for ordering the keys
Here, the keys are the rs-ids ordered on their numerical value. Both keys and values have a type"leveldb::Slice"
.class RsComparator : public leveldb::Comparator
{
private:
/* parses the rsId: rs[0-9]+ */
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
return rs(a)-rs(b);
}
(...)
}
(...)
RsComparator rs_comparator;
options.comparator=&rs_comparator;
Inserting a FASTA sequence
(...)
std::string name;
std::string sequence;
(...)
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}
Searching for a rs###
(...)
std::string name(argv[optind]);
std::string sequence;
(...)
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
(...)
Dumping all the SNPs using an iterator
(...)
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
(...)
Examples
Reading the SNPs:$ curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\Dump the snps
./rsput -D snp.db
[LOG]opening database
[LOG] added rs8896
[LOG] added rs8936
[LOG] added rs9743
(...)
[LOG]closing database
$ du -h snp.db
336K snp.db
$ ./rsget -D snp.dbSearch for some SNPs:
[LOG]opening database
>rs8896
GGTGTTGGTTCTCTTAATCTTTAACTTAAAAGGTTAATGCTAAGTTAGCTTTACAGTGGG
CTCTAGAGGGGGTAGAGGGGGTGYTATAGGGTAAATACGGGCCCTATTTCAAAGATTTTT
AGGGGAATTAATTCTAGGACGATGGGCATGAAACTGTGGTTTGCTCCACAGATTTCAGAG
CATT
>rs8936
ACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCG
ACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTA
TAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA
CAGATGCAATTCCCGGACGTHTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTAT
ACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAG
AATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCT
ACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAAC
>rs9743
CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACAC
TAACCATATACCAATGATGNCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACA
CACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAANTTT
TTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAAN
(...)
[LOG]closing database
$ ./rsget -D snp.db rs78894381 rs72619361 rs25
[LOG]opening database
[LOG]searching for rs78894381
>rs78894381
CTACTAATCTCATCAACACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTA
ACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCNCACAGTTTATGTAGCTTAC
CTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATA
GGTTTGGTCCTAGCCTTTCTA
[LOG]searching for rs72619361
>rs72619361
ATGCATTTGGTATTTTAATCTGGGGGGTGTGCACGCGATAGCATTGTGAAACGCTGGCCC
CAGAGCACCCTATGTCGCAGTGTCTGTCTTTGATTCCTGCCYCATCCCATTATTGATCAC
ACCTACATTCAATATCCCAGGCGAGCATACCTATCACAAGGTGTTAATTAATTAATGCTT
GTAGGACATAACAATCAGTAAAC
[LOG]searching for rs25
Cannot find rs25 in snp.db
[LOG]closing database
Source code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CC=g++ | |
LEVELDB=/your/path/to/leveldb-read-only | |
CFLAGS=-I$(LEVELDB)/include -Wall -O2 | |
LDFLAGS=-L$(LEVELDB) -lz -lleveldb -lpthread | |
all:test | |
test:rsdb | |
curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\ | |
./rsput -D snp.db | |
./rsget -D snp.db | |
./rsget -D snp.db rs78894381 rs72619361 rs25 | |
rsdb:rsput rsget | |
rsput:snp.c | |
$(CC) -o $@ $(CFLAGS) $< $(LDFLAGS) | |
rsget:snp.c | |
$(CC) -o $@ $(CFLAGS) -DREADONLY $< $(LDFLAGS) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Author: | |
* Pierre Lindenbaum PhD | |
* Contact: | |
* plindenbaum@yahoo.fr | |
* WWW: | |
* http://plindenbaum.blogspot.com | |
* Reference: | |
* http://code.google.com/p/leveldb/ | |
* Motivation: | |
* stores some SNPs using the leveldb library | |
* Usage: | |
* curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" | ./rsput -D snp.db | |
* ./rsget -D snp.db | |
* ./rsget -D snp.db rs78894381 rs72619361 rs25 | |
*/ | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <iostream> | |
#include <cassert> | |
#include <cerrno> | |
#include <zlib.h> | |
#include "leveldb/comparator.h" | |
#include "leveldb/db.h" | |
using namespace std; | |
/** compare the rs## on their numeric ID */ | |
class RsComparator : public leveldb::Comparator | |
{ | |
private: | |
int rs(const leveldb::Slice& s) const | |
{ | |
int n=0; | |
for(size_t i=2;i< s.size();++i) | |
{ | |
n=n*10+s[i]-'0'; | |
} | |
return n; | |
} | |
public: | |
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const | |
{ | |
if(!(a.size()>2 && a[0]=='r' && a[1]=='s' && b.size()>2 && b[0]=='r' && b[1]=='s' )) | |
{ | |
cerr << "error; expected two rs## but got " << a.ToString() << " and " << b.ToString() << endl; | |
exit(EXIT_FAILURE); | |
} | |
return rs(a)-rs(b); | |
} | |
virtual const char* Name() const { return "Rs#Comparator"; } | |
virtual void FindShortestSeparator(std::string*, const leveldb::Slice&) const { } | |
virtual void FindShortSuccessor(std::string*) const { } | |
}; | |
#ifdef READONLY | |
static void printFasta(const leveldb::Slice& name,const leveldb::Slice& seq) | |
{ | |
cout << ">"; | |
cout.write(name.data(),name.size()); | |
for(size_t i=0;i< seq.size();++i) | |
{ | |
if(i%60==0) cout << endl; | |
cout << seq[i]; | |
} | |
cout <<endl; | |
} | |
#else | |
static int readFasta(leveldb::DB* db,gzFile in) | |
{ | |
string name; | |
string sequence; | |
sequence.reserve(1000000); | |
for(;;) | |
{ | |
int c=gzgetc(in); | |
if(c==EOF || c=='>') | |
{ | |
if(sequence.size()>0) | |
{ | |
string::size_type i=name.find("rs="); | |
string::size_type j; | |
if(i!=string::npos && i+3< name.size() && | |
isdigit(name[i+3]) && | |
(j=name.find_first_not_of("0123456789",i+3))!=string::npos && | |
name[j]=='|' | |
) | |
{ | |
//remove head, tail and '=' in ...rs=98|... | |
name.erase(j).erase(0,i).erase(2,1); | |
} | |
else | |
{ | |
cerr << "Cannot find rs=<digit>+ in "<< name << " " << endl; | |
return EXIT_FAILURE; | |
} | |
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence); | |
if(!status.ok()) | |
{ | |
cerr << "Cannot insert "<< name << " " | |
<< status.ToString() << endl; | |
return EXIT_FAILURE; | |
} | |
clog << "[LOG] added " << name << endl; | |
} | |
if(c==EOF) break; | |
sequence.clear(); | |
name.clear(); | |
while((c=gzgetc(in))!=EOF && c!='\n') | |
{ | |
name+=(char)c; | |
} | |
continue; | |
} | |
else if(c=='#') //comments at the end of dbSNP fasta file! | |
{ | |
while((c=gzgetc(in))!=EOF && c!='\n') ; | |
continue; | |
} | |
if(!isalpha(c)) continue; | |
sequence+=(char)c; | |
} | |
return EXIT_SUCCESS; | |
} | |
#endif | |
int main(int argc,char** argv) | |
{ | |
char* db_home=NULL; | |
int optind=1; | |
RsComparator rs_comparator; | |
while(optind < argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
cerr << argv[0] << ": Pierre Lindenbaum PHD. 2011." << endl; | |
return(EXIT_SUCCESS); | |
} | |
else if((strcmp(argv[optind],"-D")==0 || strcmp(argv[optind],"--db-home")==0) | |
&& optind+1< argc) | |
{ | |
db_home=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
++optind; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
cerr << "unknown option '"<<argv[optind] << "'" << endl; | |
return(EXIT_FAILURE); | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(db_home==NULL) | |
{ | |
cerr << "undefined db_home" << endl; | |
return(EXIT_FAILURE); | |
} | |
leveldb::DB* db=NULL; | |
leveldb::Options options; | |
options.comparator=&rs_comparator; | |
#ifdef READONLY | |
options.create_if_missing = false; | |
#else | |
options.create_if_missing = true; | |
#endif | |
clog << "[LOG]opening database" << endl; | |
leveldb::Status status = leveldb::DB::Open(options,db_home, &db); | |
//check status | |
if (!status.ok()) | |
{ | |
cerr << "cannot open " << db_home << " : " << status.ToString() << endl; | |
return EXIT_FAILURE; | |
} | |
#ifdef READONLY | |
/* dump all, using an iterator */ | |
if(optind==argc) | |
{ | |
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); | |
for (it->SeekToFirst(); it->Valid(); it->Next()) | |
{ | |
printFasta(it->key(),it->value()); | |
} | |
delete it; | |
} | |
else/* loop over the names */ | |
{ | |
while(optind< argc) | |
{ | |
string seqname(argv[optind++]); | |
string sequence; | |
clog << "[LOG]searching for "<< seqname << endl; | |
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence); | |
if(!status.ok()) | |
{ | |
cerr << "Cannot find " << seqname<< " in " << db_home << endl; | |
continue; | |
} | |
printFasta(seqname,sequence); | |
} | |
} | |
#else | |
/* read from stdin */ | |
if(optind==argc) | |
{ | |
readFasta(db,gzdopen(fileno(stdin), "r")); | |
} | |
else | |
{ | |
/** loop over each fasta file */ | |
while(optind< argc) | |
{ | |
char* filename=argv[optind++]; | |
errno=0; | |
gzFile in=gzopen(filename,"rb"); | |
if(in==NULL) | |
{ | |
cerr << argv[0] | |
<< ": Cannot open "<< filename | |
<< ":" << strerror(errno) | |
<< endl; | |
exit(EXIT_FAILURE); | |
} | |
int ret=readFasta(db,in); | |
gzclose(in); | |
if(ret!=0) break; | |
} | |
} | |
#endif | |
/** close the leveldb database */ | |
clog << "[LOG]closing database" << endl; | |
delete db; | |
return EXIT_SUCCESS; | |
} |
That's it,
Pierre
1 comment:
Thanks, looks promising
I'm always looking for a persistent and friendly ways to store some key-value pairs
Performance wise looks very nice though tests presented were a bit small so ill just grab it and have a look
again thanks for sharing and for your cool posts!
regards
adam
Post a Comment