Storing some SNPs using the leveldb library. My notebook.
In this post I'll describe how I've used the leveldb API, a C++ key-value database, to store some SNPs (key=rs,value=sequence). "LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.". A benchmark for this engine is available here: http://code.google.com/p/leveldb/source/browse/trunk/doc/benchmark.html.
Download & install
$ svn checkout http://leveldb.googlecode.com/svn/trunk/ leveldb-read-only
$ cd leveldb-read-only/
$ make
Open & close a leveldb database
#include "leveldb/db.h"
(...)
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;/* custom comparator for ordering the keys, see later */
options.create_if_missing = true;
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
(...)
/* use the database */
(...)
delete db;
A custom comparator for ordering the keys
Here, the keys are the rs-ids ordered on their numerical value. Both keys and values have a type"leveldb::Slice"
.class RsComparator : public leveldb::Comparator
{
private:
/* parses the rsId: rs[0-9]+ */
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
return rs(a)-rs(b);
}
(...)
}
(...)
RsComparator rs_comparator;
options.comparator=&rs_comparator;
Inserting a FASTA sequence
(...)
std::string name;
std::string sequence;
(...)
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}
Searching for a rs###
(...)
std::string name(argv[optind]);
std::string sequence;
(...)
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
(...)
Dumping all the SNPs using an iterator
(...)
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
(...)
Examples
Reading the SNPs:$ curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\Dump the snps
./rsput -D snp.db
[LOG]opening database
[LOG] added rs8896
[LOG] added rs8936
[LOG] added rs9743
(...)
[LOG]closing database
$ du -h snp.db
336K snp.db
$ ./rsget -D snp.dbSearch for some SNPs:
[LOG]opening database
>rs8896
GGTGTTGGTTCTCTTAATCTTTAACTTAAAAGGTTAATGCTAAGTTAGCTTTACAGTGGG
CTCTAGAGGGGGTAGAGGGGGTGYTATAGGGTAAATACGGGCCCTATTTCAAAGATTTTT
AGGGGAATTAATTCTAGGACGATGGGCATGAAACTGTGGTTTGCTCCACAGATTTCAGAG
CATT
>rs8936
ACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCG
ACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTA
TAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA
CAGATGCAATTCCCGGACGTHTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTAT
ACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAG
AATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCT
ACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAAC
>rs9743
CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACAC
TAACCATATACCAATGATGNCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACA
CACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAANTTT
TTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAAN
(...)
[LOG]closing database
$ ./rsget -D snp.db rs78894381 rs72619361 rs25
[LOG]opening database
[LOG]searching for rs78894381
>rs78894381
CTACTAATCTCATCAACACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTA
ACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCNCACAGTTTATGTAGCTTAC
CTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATA
GGTTTGGTCCTAGCCTTTCTA
[LOG]searching for rs72619361
>rs72619361
ATGCATTTGGTATTTTAATCTGGGGGGTGTGCACGCGATAGCATTGTGAAACGCTGGCCC
CAGAGCACCCTATGTCGCAGTGTCTGTCTTTGATTCCTGCCYCATCCCATTATTGATCAC
ACCTACATTCAATATCCCAGGCGAGCATACCTATCACAAGGTGTTAATTAATTAATGCTT
GTAGGACATAACAATCAGTAAAC
[LOG]searching for rs25
Cannot find rs25 in snp.db
[LOG]closing database
Source code
CC=g++ | |
LEVELDB=/your/path/to/leveldb-read-only | |
CFLAGS=-I$(LEVELDB)/include -Wall -O2 | |
LDFLAGS=-L$(LEVELDB) -lz -lleveldb -lpthread | |
all:test | |
test:rsdb | |
curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\ | |
./rsput -D snp.db | |
./rsget -D snp.db | |
./rsget -D snp.db rs78894381 rs72619361 rs25 | |
rsdb:rsput rsget | |
rsput:snp.c | |
$(CC) -o $@ $(CFLAGS) $< $(LDFLAGS) | |
rsget:snp.c | |
$(CC) -o $@ $(CFLAGS) -DREADONLY $< $(LDFLAGS) |
/** | |
* Author: | |
* Pierre Lindenbaum PhD | |
* Contact: | |
* plindenbaum@yahoo.fr | |
* WWW: | |
* http://plindenbaum.blogspot.com | |
* Reference: | |
* http://code.google.com/p/leveldb/ | |
* Motivation: | |
* stores some SNPs using the leveldb library | |
* Usage: | |
* curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" | ./rsput -D snp.db | |
* ./rsget -D snp.db | |
* ./rsget -D snp.db rs78894381 rs72619361 rs25 | |
*/ | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <iostream> | |
#include <cassert> | |
#include <cerrno> | |
#include <zlib.h> | |
#include "leveldb/comparator.h" | |
#include "leveldb/db.h" | |
using namespace std; | |
/** compare the rs## on their numeric ID */ | |
class RsComparator : public leveldb::Comparator | |
{ | |
private: | |
int rs(const leveldb::Slice& s) const | |
{ | |
int n=0; | |
for(size_t i=2;i< s.size();++i) | |
{ | |
n=n*10+s[i]-'0'; | |
} | |
return n; | |
} | |
public: | |
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const | |
{ | |
if(!(a.size()>2 && a[0]=='r' && a[1]=='s' && b.size()>2 && b[0]=='r' && b[1]=='s' )) | |
{ | |
cerr << "error; expected two rs## but got " << a.ToString() << " and " << b.ToString() << endl; | |
exit(EXIT_FAILURE); | |
} | |
return rs(a)-rs(b); | |
} | |
virtual const char* Name() const { return "Rs#Comparator"; } | |
virtual void FindShortestSeparator(std::string*, const leveldb::Slice&) const { } | |
virtual void FindShortSuccessor(std::string*) const { } | |
}; | |
#ifdef READONLY | |
static void printFasta(const leveldb::Slice& name,const leveldb::Slice& seq) | |
{ | |
cout << ">"; | |
cout.write(name.data(),name.size()); | |
for(size_t i=0;i< seq.size();++i) | |
{ | |
if(i%60==0) cout << endl; | |
cout << seq[i]; | |
} | |
cout <<endl; | |
} | |
#else | |
static int readFasta(leveldb::DB* db,gzFile in) | |
{ | |
string name; | |
string sequence; | |
sequence.reserve(1000000); | |
for(;;) | |
{ | |
int c=gzgetc(in); | |
if(c==EOF || c=='>') | |
{ | |
if(sequence.size()>0) | |
{ | |
string::size_type i=name.find("rs="); | |
string::size_type j; | |
if(i!=string::npos && i+3< name.size() && | |
isdigit(name[i+3]) && | |
(j=name.find_first_not_of("0123456789",i+3))!=string::npos && | |
name[j]=='|' | |
) | |
{ | |
//remove head, tail and '=' in ...rs=98|... | |
name.erase(j).erase(0,i).erase(2,1); | |
} | |
else | |
{ | |
cerr << "Cannot find rs=<digit>+ in "<< name << " " << endl; | |
return EXIT_FAILURE; | |
} | |
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence); | |
if(!status.ok()) | |
{ | |
cerr << "Cannot insert "<< name << " " | |
<< status.ToString() << endl; | |
return EXIT_FAILURE; | |
} | |
clog << "[LOG] added " << name << endl; | |
} | |
if(c==EOF) break; | |
sequence.clear(); | |
name.clear(); | |
while((c=gzgetc(in))!=EOF && c!='\n') | |
{ | |
name+=(char)c; | |
} | |
continue; | |
} | |
else if(c=='#') //comments at the end of dbSNP fasta file! | |
{ | |
while((c=gzgetc(in))!=EOF && c!='\n') ; | |
continue; | |
} | |
if(!isalpha(c)) continue; | |
sequence+=(char)c; | |
} | |
return EXIT_SUCCESS; | |
} | |
#endif | |
int main(int argc,char** argv) | |
{ | |
char* db_home=NULL; | |
int optind=1; | |
RsComparator rs_comparator; | |
while(optind < argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
cerr << argv[0] << ": Pierre Lindenbaum PHD. 2011." << endl; | |
return(EXIT_SUCCESS); | |
} | |
else if((strcmp(argv[optind],"-D")==0 || strcmp(argv[optind],"--db-home")==0) | |
&& optind+1< argc) | |
{ | |
db_home=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
++optind; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
cerr << "unknown option '"<<argv[optind] << "'" << endl; | |
return(EXIT_FAILURE); | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(db_home==NULL) | |
{ | |
cerr << "undefined db_home" << endl; | |
return(EXIT_FAILURE); | |
} | |
leveldb::DB* db=NULL; | |
leveldb::Options options; | |
options.comparator=&rs_comparator; | |
#ifdef READONLY | |
options.create_if_missing = false; | |
#else | |
options.create_if_missing = true; | |
#endif | |
clog << "[LOG]opening database" << endl; | |
leveldb::Status status = leveldb::DB::Open(options,db_home, &db); | |
//check status | |
if (!status.ok()) | |
{ | |
cerr << "cannot open " << db_home << " : " << status.ToString() << endl; | |
return EXIT_FAILURE; | |
} | |
#ifdef READONLY | |
/* dump all, using an iterator */ | |
if(optind==argc) | |
{ | |
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions()); | |
for (it->SeekToFirst(); it->Valid(); it->Next()) | |
{ | |
printFasta(it->key(),it->value()); | |
} | |
delete it; | |
} | |
else/* loop over the names */ | |
{ | |
while(optind< argc) | |
{ | |
string seqname(argv[optind++]); | |
string sequence; | |
clog << "[LOG]searching for "<< seqname << endl; | |
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence); | |
if(!status.ok()) | |
{ | |
cerr << "Cannot find " << seqname<< " in " << db_home << endl; | |
continue; | |
} | |
printFasta(seqname,sequence); | |
} | |
} | |
#else | |
/* read from stdin */ | |
if(optind==argc) | |
{ | |
readFasta(db,gzdopen(fileno(stdin), "r")); | |
} | |
else | |
{ | |
/** loop over each fasta file */ | |
while(optind< argc) | |
{ | |
char* filename=argv[optind++]; | |
errno=0; | |
gzFile in=gzopen(filename,"rb"); | |
if(in==NULL) | |
{ | |
cerr << argv[0] | |
<< ": Cannot open "<< filename | |
<< ":" << strerror(errno) | |
<< endl; | |
exit(EXIT_FAILURE); | |
} | |
int ret=readFasta(db,in); | |
gzclose(in); | |
if(ret!=0) break; | |
} | |
} | |
#endif | |
/** close the leveldb database */ | |
clog << "[LOG]closing database" << endl; | |
delete db; | |
return EXIT_SUCCESS; | |
} |
That's it,
Pierre