19 April 2010

A stateful C function for R: parsing Fasta sequences

In the following post, I'll create a C extension for R. This extension will iterate over all the FASTA sequences in a file and will return a pair(name,sequence) for each sequence, that is to say that I won't store all the sequences in memory.

The C code

The structure FastaHandler holds the state of the fasta parser. It contains a pointer to the FILE, the current sequence and fasta header. We also need to save the previous line.
/** stateful structure for the fastafile */
typedef struct fastaHandler_t
{
/** input file */
FILE* in;
/** save the previous line, it will contains the next fasta header */
char* previous_line;
/** sequence name */
char *seq_name;
/** sequence dna */
char *seq_dna;
/** sequence_length */
int seq_length;
}FastaHandler,*FastaHandlerPtr;

The function fasta_open opens the fasta file and initialize a new FastaHandlerPtr. This pointer is then persisted into a 'R' variable using R_MakeExternalPtr.
/**
* open a fasta file, init the FastaHandler
*/
SEXP fasta_open(SEXP filename)
{
FastaHandlerPtr handle=NULL;
if(!isString(filename)) error("filename is not a string");
if(length(filename)!=1) error("expected only one filename");

handle=(FastaHandlerPtr)calloc(1,sizeof(FastaHandler));
if(handle==NULL)
{
error("Cannot alloc FastaHandler");
}
const char* c_filename=CHAR(STRING_ELT(filename,0));
errno=0;
handle->in= fopen( c_filename,"r");
if(handle->in==NULL)
{
error("Cannot open \"%s\": %s",c_filename,strerror(errno));
}
/** the handle is bound a R variable */
return R_MakeExternalPtr(handle, R_NilValue, R_NilValue);
}
We also need a function to close the structure. The pointer to the FastaHandlerPtr is retrieved using R_ExternalPtrAddr(R_variable):
/**
* close the FastaHandlerPtr
*/
SEXP fasta_close(SEXP r_handle)
{
FastaHandlerPtr handle = R_ExternalPtrAddr(r_handle);;
if(handle==NULL) error("handle==NULL");
if(handle->in!=NULL)
{
fclose(handle->in);
handle->in=NULL;
}
if(handle->previous_line!=NULL)
{
free(handle->previous_line);
handle->previous_line=NULL;
}
free(handle->seq_name);
handle->seq_name=NULL;
free(handle->seq_dna);
handle->seq_dna=NULL;
R_ClearExternalPtr(r_handle);
return ScalarInteger(0);
}
The function fasta_next return the next pair(name,sequence):
/**
* read the next fasta sequence
*/
SEXP fasta_next(SEXP r_handle)
{
int count=0;
char* ptr=NULL;
FastaHandlerPtr handle = R_ExternalPtrAddr(r_handle);;
if(handle==NULL) error("handle==NULL");
if(handle->in==NULL) error("handle->in==NULL");

while((ptr=readLine(handle))!=NULL)
{
if(ptr[0]=='>')
{
//this is the header of a fasta sequence
if(handle->seq_name!=NULL || handle->seq_dna!=NULL)
{
handle->previous_line=ptr;
SEXP return_value=NULL;
//if there a sequence in memory ?
if( handle->seq_name!=NULL &&
handle->seq_dna!=NULL &&
handle->seq_length>0)
{
//create the sequence
return_value=make_sequence(
&handle->seq_name[1],
handle->seq_dna
);
}
free(handle->seq_name);
free(handle->seq_dna);
handle->seq_name=NULL;
handle->seq_dna=NULL;
handle->seq_length=0;
//return the sequence if any
if(return_value!=NULL) return return_value;
}
//cleanup
handle->seq_name=ptr;
handle->seq_length=0;
handle->seq_dna=NULL;
}
else
{
int j=0;
int len=0;
//remove blank characters
while(ptr[j]!=0)
{
if(!isspace(ptr[j]))
{
ptr[len++]=ptr[j];
}
++j;
}
//enlarge the dna sequence
handle->seq_dna=realloc(
handle->seq_dna,
sizeof(char)*(handle->seq_length+1+len)
);
if(handle->seq_dna==NULL) error("cannot realloc seq_dna");
//append the new line
memcpy(&handle->seq_dna[handle->seq_length],ptr,sizeof(char)*len);
handle->seq_length+=len;
handle->seq_dna[handle->seq_length]=0;
free(ptr);
ptr=NULL;
}
}
//last sequence
SEXP return_value=NULL;
if( handle->seq_name!=NULL &&
handle->seq_dna!=NULL &&
handle->seq_length>0)
{
return_value=make_sequence(
&handle->seq_name[1],
handle->seq_dna
);
}
free(handle->seq_name);
free(handle->seq_dna);
handle->seq_name=NULL;
handle->seq_dna=NULL;
handle->seq_length=0;
return (return_value==NULL?R_NilValue:return_value);
}
This function calls readLine reading the next non-empty line:
/**
* return the next non empty line from a FastaHandlerPtr
* should be free()
*/
static char* readLine(FastaHandlerPtr handle)
{
int c;
char* ptr=NULL;
int length=0;
/* if there was a saved line, return it */
if(handle->previous_line!=NULL)
{
ptr=handle->previous_line;
handle->previous_line=NULL;
return ptr;
}
/** while the line is empty */
while(length==0)
{
int capacity=0;
//no more to read ?
if(feof(handle->in)) return NULL;
ptr=NULL;
//read each char
while((c=fgetc(handle->in))!=EOF)
{
//if this is an EOL, break
if(c=='\n') break;
//enlarge the buffer if it is too small
if(ptr==NULL || (length+2)>=capacity)
{
capacity+=500;
ptr=(char*)realloc(ptr,capacity*sizeof(char));
if(ptr==NULL) error("cannot realloc to %d bytes",capacity);
}
//append the char to the buffer
ptr[(length)++]=c;
}
//nothing was read (empty line) ? continue
if(length==0)
{
free(ptr);
ptr=NULL;
}
}
//add a end-of-string char
ptr[length]=0;
//return the line
return ptr;
}
It also invokes the function make_sequence(name,dna). This function creates a new R pair(name,sequence) :
/** create a pair list containing the name and the dna */
static SEXP make_sequence(const char* seq_name,const char* length)
{
SEXP values,names;
//the R value contains two objects
PROTECT(values = allocVector(VECSXP, 2));
//first item is the sequence name
SET_VECTOR_ELT(values, 0, mkString(seq_name));
//second item is the sequence dna
SET_VECTOR_ELT(values, 1, mkString(length));

//the labels
PROTECT(names = allocVector(STRSXP, 2));
//first label is the name
SET_STRING_ELT(names, 0, mkChar("name"));
//2nd label is the name
SET_STRING_ELT(names, 1, mkChar("sequence"));

//bind the labels to the values
setAttrib(values, R_NamesSymbol, names);
UNPROTECT(2);
return values;
}

The 'R' code

The 'C' dynamic library is loaded and each C function is bound to R using ".Call":
dyn.load(paste("libfasta", .Platform$dynlib.ext, sep=""))

fasta.open <- function(filename)
{
.Call("fasta_open", filename)
}

fasta.close <- function(handler)
{
.Call("fasta_close",handler)
}

fasta.next <- function(handler)
{
.Call("fasta_next",handler)
}

Compiling

Here is my makefile:
run:libfasta.so rs_chLG1.fas
${R_HOME}/bin/R --no-save < fasta.R

rs_chLG1.fas:
wget -O $@.gz ftp://ftp.ncbi.nih.gov/snp/organisms/bee_7460/rs_fasta/rs_chLG1.fas.gz
gunzip $@.gz

libfasta.so:fasta.c
gcc -fPIC -I -g -c -Wall -I ${R_HOME}/include fasta.c
gcc -shared -Wl,-soname,fasta.so.1 -o $@ fasta.o

Testing

I've downloaded some fasta sequences from dbSNP/Apis Mellifera. This file is open with fasta.open, we loop over each sequence and we print those sequences having a length lower than 300bp, at the end we close the stream with fasta.close:
f<-fasta.open("rs_chLG1.fas")

while(!is.null(snp <-fasta.next(f)))
{
if(nchar(as.character(snp$sequence))< 300)
print(snp)
}

fasta.close(f)

Result:
$name
[1] "gnl|dbSNP|rs44106732 rs=44106732|pos=48|len=298|taxid=7460|mol=\"genomic\"|class=1|alleles=\"C/T\"|build=127"

$sequence
[1] "GAAAATCAAAGACAATTTTTGGAATGGAACTAAATAACTTTATTCTTYCTTTCTTTCTCGTCGAGAATATCGTTATCGTTGCATGGGTTATGGAATAGCGTTCGTTAAAAATGTTATATTTCGAGGAAATATCGAAGATAGGCTTTGCGAAAGTCTGTTTCTCTAGAATTAAGATTTATATGTGTTGCAAGGGGAAGTTCAAAGAGAAATCGTGGCCAGTTCGAACATTATTATGTCTATCAATGATCGAGAAGTGTCATTGATGCAAGAGAAAAGTTTTCTCTTGCATTTAAGTATC"

$name
[1] "gnl|dbSNP|rs44108824 rs=44108824|pos=251|len=268|taxid=7460|mol=\"genomic\"|class=1|alleles=\"C/T\"|build=127"

$sequence
[1] "GTTAAATGGGAATTTTGGGGATTATTGGAGGAGGATTTACGTTTCGAGGATTGTTGATGATCTTAGGATTGTTTTCAGTTTGGAATTTTTTCTTCTTCTTCAACGTTGTACAACATTCTCCTCGAATTTTGTGTAACGAGGAGGATTAAACCTTTGGAAAATCACGTAAATTAGAGGACGATATATTGGGTTGGCAACTAAGTAATTGCGGATTTTTTTTAGAAAATCAAAGACAATTTTTGGAATGGAAYTAAATAACTTTATTCTT"

$name
[1] "gnl|dbSNP|rs44150807 rs=44150807|pos=251|len=274|taxid=7460|mol=\"genomic\"|class=1|alleles=\"C/T\"|build=127"

$sequence
[1] "TTCGATCTTCGTTTCACGCTCACGTTTCACGTTTCTCGTCCGGCGTAACGAACGGTATTCCGCTGATCACGAATAATTTCTTTCTGGAGAGTCCATTAGGGGACCGTCCCCTCCCCCCTCTCGCGCACACAGACACCCATCGCTTTCGACGCCTCGTCCATTCGAGGGAGAAACGAACGACTATTAGAAAAAAATCTTCTTTATATCTCTATAAATTCAATTTGCAGAGAAGCAAAGAGCTTTAAAATATYAACCATTATAACCGAACTTGTTG"

(....)


Full Source code


Makefile


run:libfasta.so rs_chLG1.fas
${R_HOME}/bin/R --no-save < fasta.R

rs_chLG1.fas:
wget -O $@.gz ftp://ftp.ncbi.nih.gov/snp/organisms/bee_7460/rs_fasta/rs_chLG1.fas.gz
gunzip $@.gz

libfasta.so:fasta.c
gcc -fPIC -I -g -c -Wall -I ${R_HOME}/include fasta.c
gcc -shared -Wl,-soname,fasta.so.1 -o $@ fasta.o

fasta.c


#include <ctype.h>
#include <errno.h>
#include <R.h>
#include <Rinternals.h>

/** stateful structure for the fastafile */
typedef struct fastaHandler_t
{
/** input file */
FILE* in;
/** save previous line, it will contains the next fasta header */
char* previous_line;
/** sequence name */
char *seq_name;
/** sequence dna */
char *seq_dna;
/** sequence_length */
int seq_length;

}FastaHandler,*FastaHandlerPtr;


/**
* return the next non empty line from a FastaHandlerPtr
* should be free()
*/
static char* readLine(FastaHandlerPtr handle)
{
int c;
char* ptr=NULL;
int length=0;
/* if there was a saved line, return it */
if(handle->previous_line!=NULL)
{
ptr=handle->previous_line;
handle->previous_line=NULL;
return ptr;
}
/** while the line is empty */
while(length==0)
{
int capacity=0;
//no more to read ?
if(feof(handle->in)) return NULL;
ptr=NULL;
//read each char
while((c=fgetc(handle->in))!=EOF)
{
//if this is an EOL, break
if(c=='\n') break;
//enlarge the buffer if it is too small
if(ptr==NULL || (length+2)>=capacity)
{
capacity+=500;
ptr=(char*)realloc(ptr,capacity*sizeof(char));
if(ptr==NULL) error("cannot realloc to %d bytes",capacity);
}
//append the char to the buffer
ptr[(length)++]=c;
}
//nothing was read (empty line) ? continue
if(length==0)
{
free(ptr);
ptr=NULL;
}
}
//add a end-of-string char
ptr[length]=0;
//return the line
return ptr;
}

/**
* open a fasta file, init the FastaHandler
*/
SEXP fasta_open(SEXP filename)
{
FastaHandlerPtr handle=NULL;
if(!isString(filename)) error("filename is not a string");
if(length(filename)!=1) error("expected only one filename");

handle=(FastaHandlerPtr)calloc(1,sizeof(FastaHandler));
if(handle==NULL)
{
error("Cannot alloc FastaHandler");
}
const char* c_filename=CHAR(STRING_ELT(filename,0));
errno=0;
handle->in= fopen( c_filename,"r");
if(handle->in==NULL)
{
error("Cannot open \"%s\": %s",c_filename,strerror(errno));
}
/** the handle is bound a R variable */
return R_MakeExternalPtr(handle, R_NilValue, R_NilValue);
}

/** create a pair list containing the name and the dna */
static SEXP make_sequence(const char* seq_name,const char* length)
{
SEXP values,names;
//the R value contains two objects
PROTECT(values = allocVector(VECSXP, 2));
//first item is the sequence name
SET_VECTOR_ELT(values, 0, mkString(seq_name));
//second item is the sequence dna
SET_VECTOR_ELT(values, 1, mkString(length));

//the labels
PROTECT(names = allocVector(STRSXP, 2));
//first label is the name
SET_STRING_ELT(names, 0, mkChar("name"));
//2nd label is the name
SET_STRING_ELT(names, 1, mkChar("sequence"));

//bind the labels to the values
setAttrib(values, R_NamesSymbol, names);
UNPROTECT(2);
return values;
}

/**
* read the next fasta sequence
*/
SEXP fasta_next(SEXP r_handle)
{
int count=0;
char* ptr=NULL;
FastaHandlerPtr handle = R_ExternalPtrAddr(r_handle);;
if(handle==NULL) error("handle==NULL");
if(handle->in==NULL) error("handle->in==NULL");

while((ptr=readLine(handle))!=NULL)
{
if(ptr[0]=='>')
{
//this is the header of a fasta sequence
if(handle->seq_name!=NULL || handle->seq_dna!=NULL)
{
handle->previous_line=ptr;
SEXP return_value=NULL;
//if there a sequence in memory ?
if( handle->seq_name!=NULL &&
handle->seq_dna!=NULL &&
handle->seq_length>0)
{
//create the sequence
return_value=make_sequence(
&handle->seq_name[1],
handle->seq_dna
);
}
free(handle->seq_name);
free(handle->seq_dna);
handle->seq_name=NULL;
handle->seq_dna=NULL;
handle->seq_length=0;
//return the sequence if any
if(return_value!=NULL) return return_value;
}
//cleanup
handle->seq_name=ptr;
handle->seq_length=0;
handle->seq_dna=NULL;
}
else
{
int j=0;
int len=0;
//remove blank characters
while(ptr[j]!=0)
{
if(!isspace(ptr[j]))
{
ptr[len++]=ptr[j];
}
++j;
}
//enlarge the dna sequence
handle->seq_dna=realloc(
handle->seq_dna,
sizeof(char)*(handle->seq_length+1+len)
);
if(handle->seq_dna==NULL) error("cannot realloc seq_dna");
//append the new line
memcpy(&handle->seq_dna[handle->seq_length],ptr,sizeof(char)*len);
handle->seq_length+=len;
handle->seq_dna[handle->seq_length]=0;
free(ptr);
ptr=NULL;
}
}
//last sequence
SEXP return_value=NULL;
if( handle->seq_name!=NULL &&
handle->seq_dna!=NULL &&
handle->seq_length>0)
{
return_value=make_sequence(
&handle->seq_name[1],
handle->seq_dna
);
}
free(handle->seq_name);
free(handle->seq_dna);
handle->seq_name=NULL;
handle->seq_dna=NULL;
handle->seq_length=0;
return (return_value==NULL?R_NilValue:return_value);
}

/**
* close the FastaHandlerPtr
*/
SEXP fasta_close(SEXP r_handle)
{
FastaHandlerPtr handle = R_ExternalPtrAddr(r_handle);;
if(handle==NULL) error("handle==NULL");
if(handle->in!=NULL)
{
fclose(handle->in);
handle->in=NULL;
}
if(handle->previous_line!=NULL)
{
free(handle->previous_line);
handle->previous_line=NULL;
}
free(handle->seq_name);
handle->seq_name=NULL;
free(handle->seq_dna);
handle->seq_dna=NULL;
R_ClearExternalPtr(r_handle);
return ScalarInteger(0);
}

fasta.R


dyn.load(paste("libfasta", .Platform$dynlib.ext, sep=""))

fasta.open <- function(filename)
{
.Call("fasta_open", filename)
}
fasta.close <- function(handler)
{
.Call("fasta_close",handler)
}

fasta.next <- function(handler)
{
.Call("fasta_next",handler)
}

f<-fasta.open("rs_chLG1.fas")

while(!is.null(snp <-fasta.next(f)))
{
if(nchar(as.character(snp$sequence))< 300)
print(snp)
}

fasta.close(f)

That's it !
Pierre

No comments: