31 July 2011

Storing some SNPs using the leveldb library. My notebook.

In this post I'll describe how I've used the leveldb API, a C++ key-value database, to store some SNPs (key=rs,value=sequence). "LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.". A benchmark for this engine is available here: http://code.google.com/p/leveldb/source/browse/trunk/doc/benchmark.html.

Download & install

$ svn checkout http://leveldb.googlecode.com/svn/trunk/ leveldb-read-only
$ cd leveldb-read-only/
$ make

Open & close a leveldb database

#include "leveldb/db.h"
(...)
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;/* custom comparator for ordering the keys, see later */
options.create_if_missing = true;
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
(...)
/* use the database */
(...)
delete db;

A custom comparator for ordering the keys

Here, the keys are the rs-ids ordered on their numerical value. Both keys and values have a type "leveldb::Slice".
class RsComparator : public leveldb::Comparator
{
private:
/* parses the rsId: rs[0-9]+ */
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
return rs(a)-rs(b);
}
(...)
}
(...)
RsComparator rs_comparator;
options.comparator=&rs_comparator;

Inserting a FASTA sequence


(...)
std::string name;
std::string sequence;
(...)
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}

Searching for a rs###

(...)
std::string name(argv[optind]);
std::string sequence;
(...)
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
(...)

Dumping all the SNPs using an iterator

(...)
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
(...)

Examples

Reading the SNPs:
$ curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\
./rsput -D snp.db

[LOG]opening database
[LOG] added rs8896
[LOG] added rs8936
[LOG] added rs9743
(...)
[LOG]closing database

$ du -h snp.db
336K snp.db
Dump the snps

$ ./rsget  -D snp.db

[LOG]opening database
>rs8896
GGTGTTGGTTCTCTTAATCTTTAACTTAAAAGGTTAATGCTAAGTTAGCTTTACAGTGGG
CTCTAGAGGGGGTAGAGGGGGTGYTATAGGGTAAATACGGGCCCTATTTCAAAGATTTTT
AGGGGAATTAATTCTAGGACGATGGGCATGAAACTGTGGTTTGCTCCACAGATTTCAGAG
CATT
>rs8936
ACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCG
ACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTA
TAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA
CAGATGCAATTCCCGGACGTHTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTAT
ACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAG
AATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCT
ACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAAC
>rs9743
CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACAC
TAACCATATACCAATGATGNCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACA
CACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAANTTT
TTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAAN
(...)
[LOG]closing database
Search for some SNPs:
$ ./rsget -D snp.db rs78894381 rs72619361 rs25
[LOG]opening database
[LOG]searching for rs78894381
>rs78894381
CTACTAATCTCATCAACACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTA
ACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCNCACAGTTTATGTAGCTTAC
CTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATA
GGTTTGGTCCTAGCCTTTCTA
[LOG]searching for rs72619361
>rs72619361
ATGCATTTGGTATTTTAATCTGGGGGGTGTGCACGCGATAGCATTGTGAAACGCTGGCCC
CAGAGCACCCTATGTCGCAGTGTCTGTCTTTGATTCCTGCCYCATCCCATTATTGATCAC
ACCTACATTCAATATCCCAGGCGAGCATACCTATCACAAGGTGTTAATTAATTAATGCTT
GTAGGACATAACAATCAGTAAAC
[LOG]searching for rs25
Cannot find rs25 in snp.db
[LOG]closing database

Source code



CC=g++
LEVELDB=/your/path/to/leveldb-read-only
CFLAGS=-I$(LEVELDB)/include -Wall -O2
LDFLAGS=-L$(LEVELDB) -lz -lleveldb -lpthread
all:test
test:rsdb
curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\
./rsput -D snp.db
./rsget -D snp.db
./rsget -D snp.db rs78894381 rs72619361 rs25
rsdb:rsput rsget
rsput:snp.c
$(CC) -o $@ $(CFLAGS) $< $(LDFLAGS)
rsget:snp.c
$(CC) -o $@ $(CFLAGS) -DREADONLY $< $(LDFLAGS)
view raw Makefile hosted with ❤ by GitHub
/**
* Author:
* Pierre Lindenbaum PhD
* Contact:
* plindenbaum@yahoo.fr
* WWW:
* http://plindenbaum.blogspot.com
* Reference:
* http://code.google.com/p/leveldb/
* Motivation:
* stores some SNPs using the leveldb library
* Usage:
* curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" | ./rsput -D snp.db
* ./rsget -D snp.db
* ./rsget -D snp.db rs78894381 rs72619361 rs25
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cassert>
#include <cerrno>
#include <zlib.h>
#include "leveldb/comparator.h"
#include "leveldb/db.h"
using namespace std;
/** compare the rs## on their numeric ID */
class RsComparator : public leveldb::Comparator
{
private:
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
if(!(a.size()>2 && a[0]=='r' && a[1]=='s' && b.size()>2 && b[0]=='r' && b[1]=='s' ))
{
cerr << "error; expected two rs## but got " << a.ToString() << " and " << b.ToString() << endl;
exit(EXIT_FAILURE);
}
return rs(a)-rs(b);
}
virtual const char* Name() const { return "Rs#Comparator"; }
virtual void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
virtual void FindShortSuccessor(std::string*) const { }
};
#ifdef READONLY
static void printFasta(const leveldb::Slice& name,const leveldb::Slice& seq)
{
cout << ">";
cout.write(name.data(),name.size());
for(size_t i=0;i< seq.size();++i)
{
if(i%60==0) cout << endl;
cout << seq[i];
}
cout <<endl;
}
#else
static int readFasta(leveldb::DB* db,gzFile in)
{
string name;
string sequence;
sequence.reserve(1000000);
for(;;)
{
int c=gzgetc(in);
if(c==EOF || c=='>')
{
if(sequence.size()>0)
{
string::size_type i=name.find("rs=");
string::size_type j;
if(i!=string::npos && i+3< name.size() &&
isdigit(name[i+3]) &&
(j=name.find_first_not_of("0123456789",i+3))!=string::npos &&
name[j]=='|'
)
{
//remove head, tail and '=' in ...rs=98|...
name.erase(j).erase(0,i).erase(2,1);
}
else
{
cerr << "Cannot find rs=<digit>+ in "<< name << " " << endl;
return EXIT_FAILURE;
}
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}
clog << "[LOG] added " << name << endl;
}
if(c==EOF) break;
sequence.clear();
name.clear();
while((c=gzgetc(in))!=EOF && c!='\n')
{
name+=(char)c;
}
continue;
}
else if(c=='#') //comments at the end of dbSNP fasta file!
{
while((c=gzgetc(in))!=EOF && c!='\n') ;
continue;
}
if(!isalpha(c)) continue;
sequence+=(char)c;
}
return EXIT_SUCCESS;
}
#endif
int main(int argc,char** argv)
{
char* db_home=NULL;
int optind=1;
RsComparator rs_comparator;
while(optind < argc)
{
if(strcmp(argv[optind],"-h")==0)
{
cerr << argv[0] << ": Pierre Lindenbaum PHD. 2011." << endl;
return(EXIT_SUCCESS);
}
else if((strcmp(argv[optind],"-D")==0 || strcmp(argv[optind],"--db-home")==0)
&& optind+1< argc)
{
db_home=argv[++optind];
}
else if(strcmp(argv[optind],"--")==0)
{
++optind;
break;
}
else if(argv[optind][0]=='-')
{
cerr << "unknown option '"<<argv[optind] << "'" << endl;
return(EXIT_FAILURE);
}
else
{
break;
}
++optind;
}
if(db_home==NULL)
{
cerr << "undefined db_home" << endl;
return(EXIT_FAILURE);
}
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;
#ifdef READONLY
options.create_if_missing = false;
#else
options.create_if_missing = true;
#endif
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
#ifdef READONLY
/* dump all, using an iterator */
if(optind==argc)
{
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
}
else/* loop over the names */
{
while(optind< argc)
{
string seqname(argv[optind++]);
string sequence;
clog << "[LOG]searching for "<< seqname << endl;
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
}
}
#else
/* read from stdin */
if(optind==argc)
{
readFasta(db,gzdopen(fileno(stdin), "r"));
}
else
{
/** loop over each fasta file */
while(optind< argc)
{
char* filename=argv[optind++];
errno=0;
gzFile in=gzopen(filename,"rb");
if(in==NULL)
{
cerr << argv[0]
<< ": Cannot open "<< filename
<< ":" << strerror(errno)
<< endl;
exit(EXIT_FAILURE);
}
int ret=readFasta(db,in);
gzclose(in);
if(ret!=0) break;
}
}
#endif
/** close the leveldb database */
clog << "[LOG]closing database" << endl;
delete db;
return EXIT_SUCCESS;
}
view raw snp.c hosted with ❤ by GitHub



That's it,

Pierre

1 comment:

adamtha said...

Thanks, looks promising
I'm always looking for a persistent and friendly ways to store some key-value pairs
Performance wise looks very nice though tests presented were a bit small so ill just grab it and have a look
again thanks for sharing and for your cool posts!

regards
adam