31 July 2011

Storing some SNPs using the leveldb library. My notebook.

In this post I'll describe how I've used the leveldb API, a C++ key-value database, to store some SNPs (key=rs,value=sequence). "LevelDB is a fast key-value storage library written at Google that provides an ordered mapping from string keys to string values.". A benchmark for this engine is available here: http://code.google.com/p/leveldb/source/browse/trunk/doc/benchmark.html.

Download & install

$ svn checkout http://leveldb.googlecode.com/svn/trunk/ leveldb-read-only
$ cd leveldb-read-only/
$ make

Open & close a leveldb database

#include "leveldb/db.h"
(...)
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;/* custom comparator for ordering the keys, see later */
options.create_if_missing = true;
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
(...)
/* use the database */
(...)
delete db;

A custom comparator for ordering the keys

Here, the keys are the rs-ids ordered on their numerical value. Both keys and values have a type "leveldb::Slice".
class RsComparator : public leveldb::Comparator
{
private:
/* parses the rsId: rs[0-9]+ */
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
return rs(a)-rs(b);
}
(...)
}
(...)
RsComparator rs_comparator;
options.comparator=&rs_comparator;

Inserting a FASTA sequence


(...)
std::string name;
std::string sequence;
(...)
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}

Searching for a rs###

(...)
std::string name(argv[optind]);
std::string sequence;
(...)
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
(...)

Dumping all the SNPs using an iterator

(...)
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
(...)

Examples

Reading the SNPs:
$ curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\
./rsput -D snp.db

[LOG]opening database
[LOG] added rs8896
[LOG] added rs8936
[LOG] added rs9743
(...)
[LOG]closing database

$ du -h snp.db
336K snp.db
Dump the snps

$ ./rsget  -D snp.db

[LOG]opening database
>rs8896
GGTGTTGGTTCTCTTAATCTTTAACTTAAAAGGTTAATGCTAAGTTAGCTTTACAGTGGG
CTCTAGAGGGGGTAGAGGGGGTGYTATAGGGTAAATACGGGCCCTATTTCAAAGATTTTT
AGGGGAATTAATTCTAGGACGATGGGCATGAAACTGTGGTTTGCTCCACAGATTTCAGAG
CATT
>rs8936
ACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCG
ACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTA
TAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAA
CAGATGCAATTCCCGGACGTHTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTAT
ACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAG
AATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGCACCCCCTCT
ACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAAC
>rs9743
CCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACAC
TAACCATATACCAATGATGNCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACA
CACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAANTTT
TTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAAN
(...)
[LOG]closing database
Search for some SNPs:
$ ./rsget -D snp.db rs78894381 rs72619361 rs25
[LOG]opening database
[LOG]searching for rs78894381
>rs78894381
CTACTAATCTCATCAACACAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTA
ACCCCATACCCCGAACCAACCAAACCCCAAAGACACCCCCNCACAGTTTATGTAGCTTAC
CTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTCACATCACCCCATAAACAAATA
GGTTTGGTCCTAGCCTTTCTA
[LOG]searching for rs72619361
>rs72619361
ATGCATTTGGTATTTTAATCTGGGGGGTGTGCACGCGATAGCATTGTGAAACGCTGGCCC
CAGAGCACCCTATGTCGCAGTGTCTGTCTTTGATTCCTGCCYCATCCCATTATTGATCAC
ACCTACATTCAATATCCCAGGCGAGCATACCTATCACAAGGTGTTAATTAATTAATGCTT
GTAGGACATAACAATCAGTAAAC
[LOG]searching for rs25
Cannot find rs25 in snp.db
[LOG]closing database

Source code



CC=g++
LEVELDB=/your/path/to/leveldb-read-only
CFLAGS=-I$(LEVELDB)/include -Wall -O2
LDFLAGS=-L$(LEVELDB) -lz -lleveldb -lpthread
all:test
test:rsdb
curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" |\
./rsput -D snp.db
./rsget -D snp.db
./rsget -D snp.db rs78894381 rs72619361 rs25
rsdb:rsput rsget
rsput:snp.c
$(CC) -o $@ $(CFLAGS) $< $(LDFLAGS)
rsget:snp.c
$(CC) -o $@ $(CFLAGS) -DREADONLY $< $(LDFLAGS)
view raw Makefile hosted with ❤ by GitHub
/**
* Author:
* Pierre Lindenbaum PhD
* Contact:
* plindenbaum@yahoo.fr
* WWW:
* http://plindenbaum.blogspot.com
* Reference:
* http://code.google.com/p/leveldb/
* Motivation:
* stores some SNPs using the leveldb library
* Usage:
* curl -s "ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chMT.fas.gz" | ./rsput -D snp.db
* ./rsget -D snp.db
* ./rsget -D snp.db rs78894381 rs72619361 rs25
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <cassert>
#include <cerrno>
#include <zlib.h>
#include "leveldb/comparator.h"
#include "leveldb/db.h"
using namespace std;
/** compare the rs## on their numeric ID */
class RsComparator : public leveldb::Comparator
{
private:
int rs(const leveldb::Slice& s) const
{
int n=0;
for(size_t i=2;i< s.size();++i)
{
n=n*10+s[i]-'0';
}
return n;
}
public:
virtual int Compare(const leveldb::Slice& a, const leveldb::Slice& b) const
{
if(!(a.size()>2 && a[0]=='r' && a[1]=='s' && b.size()>2 && b[0]=='r' && b[1]=='s' ))
{
cerr << "error; expected two rs## but got " << a.ToString() << " and " << b.ToString() << endl;
exit(EXIT_FAILURE);
}
return rs(a)-rs(b);
}
virtual const char* Name() const { return "Rs#Comparator"; }
virtual void FindShortestSeparator(std::string*, const leveldb::Slice&) const { }
virtual void FindShortSuccessor(std::string*) const { }
};
#ifdef READONLY
static void printFasta(const leveldb::Slice& name,const leveldb::Slice& seq)
{
cout << ">";
cout.write(name.data(),name.size());
for(size_t i=0;i< seq.size();++i)
{
if(i%60==0) cout << endl;
cout << seq[i];
}
cout <<endl;
}
#else
static int readFasta(leveldb::DB* db,gzFile in)
{
string name;
string sequence;
sequence.reserve(1000000);
for(;;)
{
int c=gzgetc(in);
if(c==EOF || c=='>')
{
if(sequence.size()>0)
{
string::size_type i=name.find("rs=");
string::size_type j;
if(i!=string::npos && i+3< name.size() &&
isdigit(name[i+3]) &&
(j=name.find_first_not_of("0123456789",i+3))!=string::npos &&
name[j]=='|'
)
{
//remove head, tail and '=' in ...rs=98|...
name.erase(j).erase(0,i).erase(2,1);
}
else
{
cerr << "Cannot find rs=<digit>+ in "<< name << " " << endl;
return EXIT_FAILURE;
}
leveldb::Status status = db->Put(leveldb::WriteOptions(), name, sequence);
if(!status.ok())
{
cerr << "Cannot insert "<< name << " "
<< status.ToString() << endl;
return EXIT_FAILURE;
}
clog << "[LOG] added " << name << endl;
}
if(c==EOF) break;
sequence.clear();
name.clear();
while((c=gzgetc(in))!=EOF && c!='\n')
{
name+=(char)c;
}
continue;
}
else if(c=='#') //comments at the end of dbSNP fasta file!
{
while((c=gzgetc(in))!=EOF && c!='\n') ;
continue;
}
if(!isalpha(c)) continue;
sequence+=(char)c;
}
return EXIT_SUCCESS;
}
#endif
int main(int argc,char** argv)
{
char* db_home=NULL;
int optind=1;
RsComparator rs_comparator;
while(optind < argc)
{
if(strcmp(argv[optind],"-h")==0)
{
cerr << argv[0] << ": Pierre Lindenbaum PHD. 2011." << endl;
return(EXIT_SUCCESS);
}
else if((strcmp(argv[optind],"-D")==0 || strcmp(argv[optind],"--db-home")==0)
&& optind+1< argc)
{
db_home=argv[++optind];
}
else if(strcmp(argv[optind],"--")==0)
{
++optind;
break;
}
else if(argv[optind][0]=='-')
{
cerr << "unknown option '"<<argv[optind] << "'" << endl;
return(EXIT_FAILURE);
}
else
{
break;
}
++optind;
}
if(db_home==NULL)
{
cerr << "undefined db_home" << endl;
return(EXIT_FAILURE);
}
leveldb::DB* db=NULL;
leveldb::Options options;
options.comparator=&rs_comparator;
#ifdef READONLY
options.create_if_missing = false;
#else
options.create_if_missing = true;
#endif
clog << "[LOG]opening database" << endl;
leveldb::Status status = leveldb::DB::Open(options,db_home, &db);
//check status
if (!status.ok())
{
cerr << "cannot open " << db_home << " : " << status.ToString() << endl;
return EXIT_FAILURE;
}
#ifdef READONLY
/* dump all, using an iterator */
if(optind==argc)
{
leveldb::Iterator* it = db->NewIterator(leveldb::ReadOptions());
for (it->SeekToFirst(); it->Valid(); it->Next())
{
printFasta(it->key(),it->value());
}
delete it;
}
else/* loop over the names */
{
while(optind< argc)
{
string seqname(argv[optind++]);
string sequence;
clog << "[LOG]searching for "<< seqname << endl;
leveldb::Status status = db->Get(leveldb::ReadOptions(),seqname, &sequence);
if(!status.ok())
{
cerr << "Cannot find " << seqname<< " in " << db_home << endl;
continue;
}
printFasta(seqname,sequence);
}
}
#else
/* read from stdin */
if(optind==argc)
{
readFasta(db,gzdopen(fileno(stdin), "r"));
}
else
{
/** loop over each fasta file */
while(optind< argc)
{
char* filename=argv[optind++];
errno=0;
gzFile in=gzopen(filename,"rb");
if(in==NULL)
{
cerr << argv[0]
<< ": Cannot open "<< filename
<< ":" << strerror(errno)
<< endl;
exit(EXIT_FAILURE);
}
int ret=readFasta(db,in);
gzclose(in);
if(ret!=0) break;
}
}
#endif
/** close the leveldb database */
clog << "[LOG]closing database" << endl;
delete db;
return EXIT_SUCCESS;
}
view raw snp.c hosted with ❤ by GitHub



That's it,

Pierre

27 July 2011

Controlling IGV through a Port: my notebook.

A protocol for interacting with the The Integrative Genomics Viewer (IGV) is described here: http://www.broadinstitute.org/igv/PortCommands. So, you can save some pictures of a genomic region without interacting with IGV.

Via a shell script


$ cat script.sh

mkdir -p /tmp/igv
sleep 2
echo "snapshotDirectory /tmp/igv"
sleep 2
echo "goto chr3:100"
sleep 2
echo "snapshot"
sleep 2
echo "goto chr3:200-300"
sleep 2
echo "snapshot"
sleep 2

invoke the script
$ sh script.sh |  telnet 127.0.0.1 60151
Trying 127.0.0.1...
Connected to srv-clc-02.irt.univ-nantes.prive (127.0.0.1).
Escape character is '^]'.
INFO [2011-07-27 17:58:16,680] [CommandExecutor.java:124] [Thread-6] OK
OK
INFO [2011-07-27 17:58:18,685] [CommandExecutor.java:124] [Thread-6] OK
OK
INFO [2011-07-27 17:58:18,687] [MacroSnapshotAction.java:85] [Thread-6] Snapshot: chr3_80_119.png
INFO [2011-07-27 17:58:20,787] [CommandExecutor.java:124] [Thread-6] OK
OK
INFO [2011-07-27 17:58:22,792] [CommandExecutor.java:124] [Thread-6] OK
OK
INFO [2011-07-27 17:58:22,794] [MacroSnapshotAction.java:85] [Thread-6] Snapshot: chr3_200_300.png
Connection closed by foreign host.


Via java


The code
import java.io.*;
import java.net.*;
class Test
{
public static void main(String[] args) throws Exception
{
Socket socket = new Socket("127.0.0.1", 60151);
PrintWriter out = new PrintWriter(socket.getOutputStream(), true);
BufferedReader in = new BufferedReader(new InputStreamReader(socket.getInputStream()));


out.println("snapshotDirectory /tmp/igv");
System.out.println(in.readLine());
out.println("goto chr3:100");
System.out.println(in.readLine());

out.println("snapshot");
System.out.println(in.readLine());

in.close();
out.close();
socket.close();
}
}

Compile and execute:
$ javac Test
$ java Test
INFO [2011-07-27 18:01:15,706] [CommandExecutor.java:124] [Thread-6] OK
OK

INFO [2011-07-27 18:01:17,711] [CommandExecutor.java:124] [Thread-6] OK
OK

INFO [2011-07-27 18:01:17,729] [MacroSnapshotAction.java:85] [Thread-6] Snapshot: chr3_80_119.png
INFO [2011-07-27 18:01:20,533] [CommandExecutor.java:124] [Thread-6] OK
OK

26 July 2011

A mysql full-text parser searching for some SNPs

A mysql full-text parser server plugin can be used to replace or modify the built-in full-text parser.
This post a full-text parser plugin named bioparser that extracts the rs#### id from a text.

The source

The source of this plugin is available on github at:
https://github.com/lindenb/bioudf/blob/master/src/bioparser.c.

The work horse of the plugin is a simple function bioparser_parse scanning the SQL query or the TEXT. Each time a word starting with "rs" and followed by one or more number is found, it is added to the list of words to be find:
static int bioparser_parse(MYSQL_FTPARSER_PARAM *param)
{
char *curr=param->doc;
const char *begin=param->doc;
const char *end= begin + param->length;

param->flags = MYSQL_FTFLAGS_NEED_COPY;
while(curr+2<end)
{
if(tolower(*curr)=='r' &&
tolower(*(curr+1))=='s' &&
isdigit(*(curr+2)) &&
(curr==begin || IS_DELIM(*(curr-1) ) )
)
{
char* p=curr+2;
while(p!=end && isdigit(*p))
{
++p;
}
if(p==end || IS_DELIM(*p))
{
my_add_word(param,curr,p-curr);
}
curr=p;
}
else
{
curr++;
}
}

return 0;
}

Install the Plugin


mysql> INSTALL PLUGIN bioparser SONAME 'bioparser.so';
Query OK, 0 rows affected (0.00 sec)

mysql> show plugins;
+-----------------------+----------+--------------------+--------------+---------+
| Name | Status | Type | Library | License |
+-----------------------+----------+--------------------+--------------+---------+
(...)
| partition | ACTIVE | STORAGE ENGINE | NULL | GPL |
| bioparser | ACTIVE | FTPARSER | bioparser.so | GPL |
+-----------------------+----------+--------------------+--------------+---------+
21 rows in set (0.00 sec)

Invoke Plugin


create a table that will use the plugin:
mysql> create table pubmed(
abstract TEXT,
FULLTEXT (abstract) WITH PARSER bioparser
) ENGINE=MyISAM;

Insert some abstracts.
mysql> insert into pubmed(abstract) values("A predictive role in radiation pneumonitis (RP) development was observed for the LIG4 SNP rs1805388 (adjusted hazard ratio, 2.08; 95% confidence interval, 1.04-4.12; P = .037 for the CT/TT genotype vs the CC genotype). In addition, men with the TT genotype of the XRCC4 rs6869366 SNP and women with AG + AA genotypes of the XRCC5 rs3835 SNP also were at increased risk of developing severe RP.");
Query OK, 1 row affected (0.00 sec)

(...)

mysql> select abstract from pubmed;
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| abstract |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| A predictive role in radiation pneumonitis (RP) development was observed for the LIG4 SNP rs1805388 (adjusted hazard ratio, 2.08; 95% confidence interval, 1.04-4.12; P = .037 for the CT/TT genotype vs the CC genotype). In addition, men with the TT genotype of the XRCC4 rs6869366 SNP and women with AG + AA genotypes of the XRCC5 rs3835 SNP also were at increased risk of developing severe RP. |
| Nonhomologous end joining (NHEJ) is a pathway that repairs DNA double-strand breaks (DSBs) to maintain genomic stability in response to irradiation. The authors hypothesized that single nucleotide polymorphisms (SNPs) in NHEJ repair genes may affect clinical outcomes in patients with nonsmall cell lung cancer (NSCLC) who receive definitive radio(chemo)therapy. |
| The authors genotyped 5 potentially functional SNPs-x-ray repair complementing defective repair in Chinese hamster cells 4 (XRCC4) reference SNP (rs) number rs6869366 (-1394 guanine to thymine [-1394G?T] change) and rs28360071 (intron 3, deletion/insertion), XRCC5 rs3835 (guanine to adenine [G?A] change at nucleotide 2408), XRCC6 rs2267437 (-1310 cytosine to guanine [C?G) change], and DNA ligase IV (LIG4) rs1805388 (threonine-to-isoleucine change at codon 9 [T9I])-and estimated their associations with severe radiation pneumonitis (RP) (grade ?3) in 195 patients with NSCLC. |
| The current results indicated that NHEJ genetic polymorphisms, particularly LIG4 rs1805388, may modulate the risk of RP in patients with NSCLC who receive definitive radio(chemo)therapy. Large studies will be needed to confirm these findings. |
| The repair of DNA double-strand breaks (DSBs) is the major mechanism to maintain genomic stability in response to irradiation. We hypothesized that genetic polymorphisms in DSB repair genes may affect clinical outcomes among non-small cell lung cancer (NSCLC) patients treated with definitive radio(chemo)therapy. |
| We also found that RAD51 -135G>C and XRCC2 R188H SNPs were independent prognostic factors for overall survival (adjusted HR?=?1.70, 95% CI, 1.14-2.62, P?=?0.009 for CG/CC vs. GG; and adjusted HR?=?1.70; 95% CI, 1.02-2.85, P?=?0.043 for AG vs. GG, respectively) and that the SNP-survival association was most pronounced in the presence of RP. |
| A total of 291 patients (145 male/146 female, mean age (± S.D.) 52.2 (± 13.1) years) with PsA were examined clinically, by standard laboratory tests and their DNA was genotyped for the SNP rs2476601 (PTPN22 +1858 C/T). Allelic frequencies were determined and compared with 725 controls. |
| this is a test rs2476601, rs1805388, rs3835 and rs25 |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
8 rows in set (0.00 sec)

Now, test the plugin:

mysql> select
concat(left(abstract,40),"...") as ABSTRACT,
match(abstract) against("DNA double-strand) as SCORE
from pubmed group by 2 HAVING SCORE!=0;
Empty set (0.00 sec)

mysql> select
concat(left(abstract,40),"...") as ABSTRACT,
match(abstract) against("rs25") as SCORE
from pubmed group by 2 HAVING SCORE!=0;
+---------------------------------------------+--------------------+
| ABSTRACT | SCORE |
+---------------------------------------------+--------------------+
| this is a test rs2476601, rs1805388, rs3... | 1.8603347539901733 |
+---------------------------------------------+--------------------+
1 row in set (0.01 sec)

mysql> select
concat(left(abstract,40),"...") as ABSTRACT,
match(abstract) against("rs2476601 rs1805388 rs6869366") as SCORE
from pubmed group by 2 HAVING SCORE!=0 order by 2 desc;
+---------------------------------------------+--------------------+
| ABSTRACT | SCORE |
+---------------------------------------------+--------------------+
| A total of 291 patients (145 male/146 fe... | 1.086121916770935 |
| A predictive role in radiation pneumonit... | 1.0619741678237915 |
| this is a test rs2476601, rs1805388, rs3... | 1.0502985715866089 |
| The authors genotyped 5 potentially func... | 1.0388768911361694 |
+---------------------------------------------+--------------------+
4 rows in set (0.00 sec)

mysql> select
concat(left(abstract,40),"...") as ABSTRACT,
match(abstract) against("rs25,rs2476601,rs1805388,rs6869366") as SCORE
from pubmed group by 2 HAVING SCORE!=0 order by 2 desc;
+---------------------------------------------+--------------------+
| ABSTRACT | SCORE |
+---------------------------------------------+--------------------+
| this is a test rs2476601, rs1805388, rs3... | 2.9106333255767822 |
| A total of 291 patients (145 male/146 fe... | 1.086121916770935 |
| A predictive role in radiation pneumonit... | 1.0619741678237915 |
| The authors genotyped 5 potentially func... | 1.0388768911361694 |
+---------------------------------------------+--------------------+
4 rows in set (0.00 sec)

uninstall the plugin


mysql> UNINSTALL PLUGIN bioparser;
Query OK, 0 rows affected (0.00 sec)


That's it,

Pierre

25 July 2011

The samtools C API: creating a WIG file for the genome coverage

I wrote a tool which uses the samtools API to generate a WIG file for the genome coverage. This tool, bam2wig, as well as some other tools using this library are available in the following git repository:


My code is based on this original C source hosted on: http://samtools.sourceforge.net/sam-exam.shtml. (I hope I understood this API as many functions/structures remain undocumented).

Example

#compile
$ export SAMDIR=/path/to/samtools-0.1.17
$ cd amtools-utilities
$ make
#invoke bam2wig
$ .bin/bam2wig -t /path/to/my/sample.bam "chr1:120408225-120558920" | head
track name="__TRACK_NAME__" description="__TRACK_DESC__" type="wiggle_0"
fixedStep chrom=chr1 start=120408225 step=1 span=1
1
1
1
2
2
3
4
5

Result

The wig file uploaded as a custom track in the UCSC genome browser:


That's it

Pierre

20 July 2011

Drawing a SVG timeline with the http://data.bnf.fr data

The National French Library has recently started to release
its data as RDF/XML. Here, I've played with the biographies of the famous French writers to create a simple timeline.

Unfortunately, this timeline is too large to be displayed here. :-)






However, here is the java code I used to generate this map as a SVG document:
/**
* Author:
* Pierre Lindenbaum PhD
* Date:
* July-2011
* Contact:
* plindenbaum@yahoo.fr
* Reference:
*
* WWW:
* http://plindenbaum.blogspot.com
* Motivation:
* timeline from http://data.bnf.fr
*
*/
import java.awt.Dimension;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
public class BNFTimeline
{
static final int ICON_SIZE=64;
static final int AUTHOR_HEIGHT=ICON_SIZE+12;
static final int MARGIN=5;
private static final String SVG="http://www.w3.org/2000/svg";
private static final String CubicWeb="http://www.logilab.org/2008/cubicweb";
private static final String HTML="http://www.w3.org/1999/xhtml";
private static Logger LOG=Logger.getLogger(BNFTimeline.class.getName());
private DocumentBuilder docBuilder=null;
private XPath xpath=null;
private Map<String, String> prefix2uri=new HashMap<String, String>();
private Double minDays=null;
private Double maxDays=null;
private static class Date implements Comparable<Date>
{
String literal;
int year;
Integer month;
Integer day;
@Override
public int compareTo(Date o)
{
double d= days()-o.days();
if(d!=0.0) return d<0?-1:1;
return 0;
}
public double days()
{
double v= year*365.25;
if(month!=null)
{
v+= (365.25/12.0)*month;
if(day!=null)
{
v+=day;
}
}
return v;
}
}
private class Author
{
String url;
String name;
String birthPlace;
Date birthDate;
String deathPlace;
Date deathDate;
String gender;
String shortBio;
String depiction;
Dimension iconSize;
int y;
public double x1()
{
return convertDate2Pixel(birthDate);
}
public double x2()
{
return convertDate2Pixel(deathDate);
}
void writeXML(XMLStreamWriter w) throws XMLStreamException
{
w.writeStartElement("a");
w.writeAttribute("xlink:href", this.url);
w.writeAttribute("xlink:target","_blank");
w.writeStartElement("g");
w.writeAttribute("title",String.valueOf(name));
w.writeAttribute("transform", "translate("+x1()+","+(MARGIN+y*(AUTHOR_HEIGHT+MARGIN))+")");
w.writeStartElement("rect");
w.writeAttribute("style", "fill:black;stroke:white;");
w.writeAttribute("height", String.valueOf(AUTHOR_HEIGHT));
w.writeAttribute("width", String.valueOf(x2()-x1()));
w.writeEndElement();//rect
double textLength=(x2()-x1())-(MARGIN/2);
int shift=MARGIN;
if(this.iconSize!=null)
{
w.writeEmptyElement("image");
w.writeAttribute("x", String.valueOf(MARGIN+(ICON_SIZE-this.iconSize.width)/2));
w.writeAttribute("y", String.valueOf(MARGIN+(ICON_SIZE-this.iconSize.height)/2));
w.writeAttribute("width", String.valueOf(this.iconSize.width));
w.writeAttribute("height", String.valueOf(this.iconSize.height));
w.writeAttribute("xlink:href",this.depiction);
shift+=(ICON_SIZE+MARGIN);
textLength-=(ICON_SIZE+MARGIN);
}
w.writeStartElement("g");
w.writeAttribute("transform", "translate("+shift+",0)");
w.writeAttribute("style", "stroke:white;fill:white;font-size:14pt;font-weight:normal;");
w.writeStartElement("text");
w.writeAttribute("x", "0");
w.writeAttribute("y", "18");
w.writeCharacters(this.name+" ("+birthDate.year+" / "+this.deathDate.year+")");
w.writeEndElement();
if(this.shortBio==null) this.shortBio="";
//note: 123 chars/600px
// 0.25char/px
String biography=shortBio;
int posY=40;
int maxCharParLine=(int)(textLength*0.2);
while(biography.length()>0 && posY+10 < AUTHOR_HEIGHT)
{
String s=biography;
if(s.length()>maxCharParLine) s=biography.substring(0,maxCharParLine);
w.writeStartElement("text");
w.writeAttribute("style", "font-size:50%;");
w.writeAttribute("x", "0");
w.writeAttribute("y", String.valueOf(posY));
w.writeCharacters(s);
w.writeEndElement();
posY+=11;
biography=biography.substring(s.length());
}
w.writeEndElement();//g
w.writeEndElement();//g
w.writeEndElement();//a
}
}
private BNFTimeline() throws Exception
{
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
f.setCoalescing(true);
f.setNamespaceAware(true);
f.setValidating(false);
f.setExpandEntityReferences(true);
f.setIgnoringComments(false);
f.setIgnoringElementContentWhitespace(true);
this.docBuilder=f.newDocumentBuilder();
this.docBuilder.setEntityResolver(new EntityResolver()
{
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws SAXException, IOException
{
LOG.info("resolve "+publicId+" "+systemId);
return new InputSource(new StringReader(""));
}
});
this.prefix2uri.put("h", HTML);
this.prefix2uri.put("cubicweb", CubicWeb);
this.prefix2uri.put(XMLConstants.XML_NS_PREFIX, XMLConstants.XML_NS_URI);
this.prefix2uri.put(XMLConstants.XMLNS_ATTRIBUTE, XMLConstants.XMLNS_ATTRIBUTE_NS_URI);
this.prefix2uri.put("dc","http://purl.org/dc/terms/");
this.prefix2uri.put("owl","http://www.w3.org/2002/07/owl#");
this.prefix2uri.put("foaf","http://xmlns.com/foaf/0.1/");
this.prefix2uri.put("rdagroup2elements","http://RDVocab.info/ElementsGr2/");
this.prefix2uri.put("rdf","http://www.w3.org/1999/02/22-rdf-syntax-ns#");
this.prefix2uri.put("skos","http://www.w3.org/2004/02/skos/core#");
this.prefix2uri.put("xfoaf","http://www.foafrealm.org/xfoaf/0.1/");
XPathFactory xpathFactory=XPathFactory.newInstance();
this.xpath=xpathFactory.newXPath();
this.xpath.setNamespaceContext(new NamespaceContext()
{
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public Iterator getPrefixes(String namespaceURI)
{
return prefix2uri.keySet().iterator();
}
@Override
public String getPrefix(String ns)
{
for(String k:prefix2uri.keySet())
{
if(prefix2uri.get(k).equals(ns)) return k;
}
return null;
}
@Override
public String getNamespaceURI(String prefix)
{
String u=prefix2uri.get(prefix);
return (u!=null?u:XMLConstants.NULL_NS_URI);
}
});
}
private int getScreenWidthInPixel()
{
return 15000;
}
private double convertDate2Pixel(Date d)
{
return getScreenWidthInPixel()*((d.days()-minDays)/((double)this.maxDays-(double)this.minDays));
}
private void parse() throws Exception
{
Tidy tidy = new Tidy();
tidy.setXHTML(true);
File xmlFile=File.createTempFile("_tmp", ".xml");
xmlFile.deleteOnExit();
final String prefix="http://data.bnf.fr/";
int pageIndex=1;
XPathExpression expr=this.xpath.compile(".//h:li/h:a[@href]");
List<Author> authors=new ArrayList<Author>();
//scan each index
for(;;)
{
boolean found=false;
URL url=new URL("http://data.bnf.fr/liste-auteurs/page"+pageIndex);
LOG.info(url.toString());
FileOutputStream fout=new FileOutputStream(xmlFile);
InputStream in=url.openStream();
tidy.parse(in,fout);
in.close();
fout.flush();
fout.close();
Document dom=this.docBuilder.parse(xmlFile);
NodeList L=(NodeList)expr.evaluate(dom, XPathConstants.NODESET);
for(int i=0;i< L.getLength();++i)
{
String href=Element.class.cast(L.item(i)).getAttribute("href");
if(!href.startsWith(prefix)) continue;
if(!href.substring(prefix.length()).matches("[0-9]+/[a-z\\-A-Z_0-9]+/"))
{
LOG.info("ignoring "+href);
continue;
}
Author author=new Author();
author.url=href;
authors.add(author);
found=true;
}
in.close();
if(!found) break;
++pageIndex;
}
xmlFile.delete();
int index=0;
while(index< authors.size())
{
Author author=authors.get(index);
LOG.info(author.url+"rdf.xml");
Document dom=this.docBuilder.parse(author.url+"rdf.xml");
Element root=(Element)xpath.evaluate("rdf:RDF/rdf:Description[rdf:type/@rdf:resource='http://xmlns.com/foaf/0.1/Person']",dom,XPathConstants.NODE);
if(root==null)
{
authors.remove(index);
continue; //e.g. "Academie Fr"
}
author.name=(String)xpath.evaluate("dc:title[1]", root,XPathConstants.STRING);
author.birthDate= parseDate((String)xpath.evaluate("rdagroup2elements:dateOfBirth", root,XPathConstants.STRING));
author.birthPlace = (String)xpath.evaluate("rdagroup2elements:placeOfBirth", root,XPathConstants.STRING);
author.deathDate = parseDate((String)xpath.evaluate("rdagroup2elements:dateOfDeath", root,XPathConstants.STRING));
author.deathPlace = (String)xpath.evaluate("rdagroup2elements:placeOfDeath", root,XPathConstants.STRING);
author.gender = (String)xpath.evaluate("foaf:gender", root,XPathConstants.STRING);
author.shortBio = (String)xpath.evaluate("rdagroup2elements:biographicalInformation", root,XPathConstants.STRING);
author.depiction=(String)xpath.evaluate("foaf:depiction/@rdf:resource",root,XPathConstants.STRING);
if(author.birthDate==null || author.deathDate==null
|| author.deathDate.year<1400 || author.birthDate.year<1400)//TODO
{
authors.remove(index);
continue;
}
if(author.depiction!=null && !author.depiction.trim().isEmpty())
{
author.iconSize=getDepictionSize(author.depiction);
}
if(this.minDays==null || author.birthDate.days()<this.minDays)
{
this.minDays= author.birthDate.days();
}
if(this.maxDays==null || author.deathDate.days()>this.maxDays)
{
this.maxDays= author.deathDate.days();
}
++index;
}
this.minDays-=360;
this.maxDays+=360;
//sort persons on birth-date/death-date
Collections.sort(authors, new Comparator<Author>()
{
@Override
public int compare(Author o1, Author o2)
{
int i=o1.birthDate.compareTo(o2.birthDate);
if(i!=0) return i;
return o1.deathDate.compareTo(o2.deathDate);
}
});
List<Author> remains=new ArrayList<Author>(authors);
int nLine=-1;
while(!remains.isEmpty())
{
++nLine;
Author first=remains.remove(0);
first.y=nLine;
while(true)
{
Author best=null;
int bestIndex=-1;
for(int i=0;i< remains.size();++i)
{
Author next=remains.get(i);
if(next.x1()< first.x2()+5) continue;
if(best==null ||
(next.x1()-first.x2() < best.x1()-first.x2()))
{
best=next;
bestIndex=i;
}
}
if(best==null) break;
first=best;
first.y=nLine;
remains.remove(bestIndex);
}
}
FileOutputStream fout=new FileOutputStream("output.svg");
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(fout,"UTF-8");
w.writeStartDocument("UTF-8","1.0");
w.writeStartElement("svg");
w.writeAttribute("xmlns", SVG);
w.writeAttribute("xmlns:xlink","http://www.w3.org/1999/xlink");
w.writeAttribute("version","1.1");
w.writeAttribute("width",String.valueOf(getScreenWidthInPixel()));
w.writeAttribute("height",String.valueOf(MARGIN+((nLine+1)*(AUTHOR_HEIGHT+MARGIN))));
w.writeAttribute("style", "fill:none;stroke:black;stroke-width:1px;");
w.writeEmptyElement("rect");
w.writeAttribute("x","0");
w.writeAttribute("y","0");
w.writeAttribute("width",String.valueOf(getScreenWidthInPixel()-1));
w.writeAttribute("height",String.valueOf(MARGIN+((nLine+1)*(AUTHOR_HEIGHT+MARGIN))-1));
w.writeAttribute("style", "fill:lightgray;stroke:black;");
for(Author author:authors)
{
author.writeXML(w);
}
w.writeEndDocument();//svg
w.close();
fout.flush();
fout.close();
}
private Dimension getDepictionSize(String resourceFile) throws Exception
{
BufferedImage img=ImageIO.read(new URL(resourceFile));
Dimension d= new Dimension(img.getWidth(),img.getHeight());
if(d.getWidth()< d.getHeight())
{
double ratio= d.getWidth()/(double)d.getHeight();//<0
int len= (int)(ICON_SIZE*ratio);
d.width=len;
d.height=ICON_SIZE;
}
else
{
double ratio= d.getHeight()/(double)d.getWidth();//<0
int len= (int)(ICON_SIZE*ratio);
d.height=len;
d.width=ICON_SIZE;
}
return d;
}
private Date parseDate(String s)
{
if(s==null || s.isEmpty()) return null;
Date d=new Date();
d.literal=s;
s=s.trim();
if(s.startsWith("-"))
{
return null;
}
if(s.matches("[0-3][0-9]\\-[0-1][0-9]\\-[0-9][0-9][0-9][0-9]"))
{
String tokens[]=s.split("[\\-]");
d.day=Integer.parseInt(tokens[0]);
d.month=Integer.parseInt(tokens[1]);
d.year=Integer.parseInt(tokens[2]);
}
else if(s.matches("[0-1][0-9]\\-[0-9][0-9][0-9][0-9]"))
{
String tokens[]=s.split("[\\-]");
d.month=Integer.parseInt(tokens[0]);
d.year=Integer.parseInt(tokens[1]);
}
else if(s.matches("[0-9]{1,4}"))
{
d.year=Integer.parseInt(s);
}
else
{
return null;
}
return d;
}
public static void main(String[] args) {
try
{
BNFTimeline app=new BNFTimeline();
int optind=0;
while(optind< args.length)
{
if(args[optind].equals("-h") ||
args[optind].equals("-help") ||
args[optind].equals("--help"))
{
System.err.println("Options:");
System.err.println(" -h help; This screen.");
return;
}
else if(args[optind].equals("-L"))
{
}
else if(args[optind].equals("--"))
{
optind++;
break;
}
else if(args[optind].startsWith("-"))
{
System.err.println("Unknown option "+args[optind]);
return;
}
else
{
break;
}
++optind;
}
if(optind!=args.length)
{
System.err.println("Illegal number of arguments.");
return;
}
app.parse();
}
catch(Throwable err)
{
err.printStackTrace();
}
}
}


See also: Freebase and the History of Sciences

That's it,

Pierre