06 September 2011

Parsing a BAM file with javascript, yes we can. (Node.js and V8)

Node.js is an event-driven I/O server-side JavaScript environment based on V8, Google's open source JavaScript engine. In the current post I will describe how I've used Node/V8 to parse a BAM file. Here I've used node v0.5.5 and my code is hosted in a new git repository bionode.

Designing a C++ native extension for Node.js wrapping the bgzf format

BAM files are stored using the bgzf format. We must first create a C++ extension wrapping the methods related to bgzf. This process is nicely described in "Writing Node.js Native Extensions". Here, a BGZF* pointer is wrapped into a class BGZFSupport that extends node::ObjectWrap:
class BGZFSupport: public ObjectWrap
 {
 private:
   BGZF* file;
 public:
    (...)
   /**
    * Closes the wrapped BGZF handle, if one is still open.
    * Returns the value of ::bgzf_close(), or 0 when there was nothing to close.
    * The pointer is reset to NULL so that later calls do nothing.
    */
   int close()
    {
    if(file==NULL) return 0;
    const int status=::bgzf_close(file);
    file=NULL;
    return status;
    }
   /** Closes the BGZF handle if it has not been closed explicitly. */
   ~BGZFSupport()
    {
    if(file==NULL) return;
    ::bgzf_close(file);
    }
 (...)
The javascript constructor for BGZFSupport opens the BGZF file and is implemented on the C++ side as:
  /**
   * JavaScript constructor: new bgzf(filename, mode).
   *
   * Validates that two string arguments were given, opens the file with
   * ::bgzf_open(filename, mode) and wraps the resulting BGZF* in a new
   * BGZFSupport instance attached to `this`.
   *
   * A JavaScript exception is raised through RETURN_THROW when fewer than two
   * arguments are given, when one of them is not a string, or when
   * ::bgzf_open() returns NULL.
   */
  static Handle<Value> New(const Arguments& args)
    {
    HandleScope scope;
    if (args.Length() < 2)
      {
      RETURN_THROW("Expected two parameters for bgzf");
      }
    if(!args[0]->IsString())
     {
     RETURN_THROW("1st argument is not a string");
     }
    if(!args[1]->IsString())
     {
     RETURN_THROW("2nd argument is not a string");
     }
    
    v8::String::Utf8Value filename(args[0]);
    v8::String::Utf8Value mode(args[1]);
    BGZF* file= ::bgzf_open(ToCString(filename),ToCString(mode));
    if(file==NULL)
     {
     RETURN_THROW("Cannot open \"" << ToCString(filename) <<  "\"");
     }
    /* the wrapper now holds the handle; see BGZFSupport::close() and the destructor */
    BGZFSupport* instance = new BGZFSupport(file);
    instance->Wrap(args.This());
    return args.This();
    }
... and so on for the other functions...

Implementing the javascript-based BAM-Reader

Next, we can embed this BGZFSupport in a javascript file that will read a BAM file:
var bgzf=require("bgzf");
and we create a javascript class/function BamReader that will open the file as bgzf and will read the BAM header:
var bgzf=require("bgzf");
var Buffer = require('buffer').Buffer;


/**
 * Opens a BAM file through the bgzf extension and parses its header.
 *
 * After construction the instance exposes:
 *   fd          the bgzf handle, left positioned just after the last
 *               reference entry of the header
 *   text        the plain-text header (l_text bytes decoded as UTF-8)
 *   references  array of {name, l_ref} objects, in file order
 *   name2seq    the same reference objects, keyed by name
 *
 * @param {string} path path of the BAM file; it is opened in "r" mode
 * @throws {Error} if the file is truncated, the magic bytes are not
 *                 'B','A','M',1 or a length field is negative
 */
function BamReader(path)
 {
 var fd=new bgzf.bgzf(path,"r");
 this.fd=fd;

 /* Reads exactly `length` bytes from the file, or throws Error(message). */
 function readBytes(length,message)
  {
  /* Buffer.alloc() when available; `new Buffer(size)` is deprecated but is
     the only option on old versions of node */
  var buf=(typeof Buffer.alloc==="function") ? Buffer.alloc(length) : new Buffer(length);
  var n=fd.read(buf,0,length);
  if(n!==length) throw new Error(message);
  return buf;
  }

 /* Reads one little-endian signed 32-bit integer. */
 function readInt32()
  {
  return readBytes(4,"Cannot read 4 bytes").readInt32LE(0);
  }

 /* magic: 'B' 'A' 'M' followed by the byte 1. The last byte is compared with
    the number 1 because the octal escape "\1" is a syntax error in strict
    mode and in ES modules. */
 var magic=readBytes(4,"Cannot read 4 bytes");
 if(magic[0]!==66)  throw new Error("Error MAGIC[0]");
 if(magic[1]!==65)  throw new Error("Error MAGIC[1] got"+magic[1]);
 if(magic[2]!==77)  throw new Error("Error MAGIC[2]");
 if(magic[3]!==1)  throw new Error("Error MAGIC[3]");

 /* l_text, then the header text itself */
 var l_text=readInt32();
 if(l_text<0) throw new Error("Invalid l_text: "+l_text);
 this.text=readBytes(l_text,"Cannot read "+l_text+" bytes (l_text)").toString('utf-8', 0, l_text);

 /* n_ref, then one (l_name, name, l_ref) entry per reference sequence */
 var n_ref=readInt32();
 this.references=[];
 this.name2seq={};
 for(var i=0;i< n_ref;++i)
  {
  var refseq={};
  var l_name=readInt32();
  if(l_name<0) throw new Error("Invalid l_name: "+l_name);
  /* the name is \0 terminated: the last byte is not part of the string */
  refseq.name=readBytes(l_name,"Cannot read "+l_name+" bytes (name)").toString('utf-8', 0,l_name-1);
  refseq.l_ref=readInt32();
  this.references.push(refseq);
  this.name2seq[refseq.name]=refseq;
  }
 }
Another function, next(), reads the next alignment or returns null (see the code).

Testing

$ export NODE_PATH=/path/to/bionode/build

The script reads a simple BAM file and prints the positions of the reads:
(...)

/* Prints "reference name <TAB> read name <TAB> position" for every alignment. */
var r= new BamReader("/path/to/samtools-0.1.17/examples/toy.bam");
try
 {
 var align;
 while((align=r.next())!=null)
  {
  /* a read without a reference has no entry in r.references (BAM uses
     refID=-1 for it): print "*" instead of failing on undefined.name */
  var ref=r.references[align.refID];
  console.log(
   (ref ? ref.name : "*")+"\t"+
   align.read_name+"\t"+
   align.pos
   );
  }
 }
finally
 {
 /* release the file even when reading or printing fails */
 r.close();
 }

Result

$ node bgzf.js
ref r001 6
ref r002 8
ref r003 8
ref r004 15
ref r003 28
ref r001 36
ref2 x1 0
ref2 x2 1
ref2 x3 5
ref2 x4 9
ref2 x5 11
ref2 x6 13


Remaining questions:

At the moment, I don't know how to correctly package the C++ and javascript files for node.js, how to correctly include the files, how to group the different files under a common 'namespace', etc...

That's It,
Pierre

1 comment:

jeffhsu3 said...

Dalliance genome browser implements a pure client-side javascript bam reader, which I thought was really cool:
Dalliance
Dalliance Github


js zlib library!

Citation: Down TA, Piipari M, Hubbard TJ., Dalliance: interactive genome viewing on the web. Bioinformatics 2011 Jan 19