Parsing the Newick format in C using flex and bison.
The following post is my answer for this question on biostar "Newick 2 Json converter".
The Newick tree format is a simple format used to write out trees (using parentheses and commas) in a text file .
The original question asked for a parser based on perl but here, I've implemented a C parser using flex/bison.
Example:
((Human:0.3, Chimpanzee:0.2):0.1, Gorilla:0.3, (Mouse:0.6, Rat:0.5):0.2);
A formal grammar for the Newick format is available here
Items in { } may appear zero or more times. Items in [ ] are optional, they may appear once or not at all. All other punctuation marks (colon, semicolon, parentheses, comma and single quote) are required parts of the format. tree ==> descendant_list [ root_label ] [ : branch_length ] ; descendant_list ==> ( subtree { , subtree } ) subtree ==> descendant_list [internal_node_label] [: branch_length] ==> leaf_label [: branch_length] root_label ==> label internal_node_label ==> label leaf_label ==> label label ==> unquoted_label ==> quoted_label unquoted_label ==> string_of_printing_characters quoted_label ==> ' string_of_printing_characters ' branch_length ==> signed_number ==> unsigned_number
The Flex Lexer
The Flex Lexer is used to extract the terminal tokens of the grammar from the input stream.Those terminals are '(' ')' ',' ';' ':' , strings and numbers. For the simple and double quoted strings, we tell the lexer to enter in a specific state ( 'apos' and 'quot').
%{ | |
#include "newick.tab.h" | |
static size_t string_length=0; | |
void* saferealloc(void *ptr, size_t size) | |
{ | |
char* p= realloc(ptr,sizeof(char)*(size)); | |
if(p==NULL) | |
{ | |
fprintf(stderr,"out of memory"); | |
exit(EXIT_FAILURE); | |
} | |
return p; | |
} | |
static char* copy(const char* s,size_t length) | |
{ | |
char* p= saferealloc(NULL,sizeof(char)*(length+1)); | |
strncpy(p,s,length); | |
p[length]=0; | |
return p; | |
} | |
static char* append(size_t* len,const char* s2,size_t len2) | |
{ | |
yylval.s= saferealloc( yylval.s,sizeof(char)*(*len+len2+1)); | |
strncpy(&yylval.s[*len],s2,len2); | |
yylval.s[*len+len2]=0; | |
*len+=len2; | |
return yylval.s; | |
} | |
%} | |
%s apos | |
%s quot | |
%% | |
<quot>{ | |
\\\" append(&string_length,"\"",1); | |
\' append(&string_length,"\'",1); | |
\" {BEGIN(INITIAL);return STRING;} | |
} | |
<apos>{ | |
\\\' append(&string_length,"\'",1); | |
\" append(&string_length,"\"",1); | |
\' {BEGIN(INITIAL);return STRING;} | |
} | |
<apos,quot>{ | |
\\n append(&string_length,"\n",1); | |
\\t append(&string_length,"\t",1); | |
\\\\ append(&string_length,"\\",1); | |
([^\"\'\\]|\n)+ append(&string_length,yytext,yyleng); | |
} | |
\: return COLON; | |
\; return SEMICOLON; | |
\) return CPAR; | |
\( return OPAR; | |
\, return COMMA; | |
\" {string_length=0;BEGIN(quot); } | |
\' {string_length=0;BEGIN(apos); } | |
[a-zA-Z_][a-zA-Z_0-9]* {yylval.s=copy(yytext,yyleng); return STRING;} | |
[\+|\-]?[0-9]+ {yylval.d=copy(yytext,yyleng); return NUMBER;} | |
[\+|\-]?[0-9]+\.[0-9]+([e|E][0-9]+)? {yylval.d=copy(yytext,yyleng); return NUMBER;} | |
[ \t\n\r] ; | |
. {fprintf(stderr,"Syntax error (%c)\n",yytext[0]);exit(EXIT_FAILURE);} | |
%% |
The Bison Scanner
The Bison scanner reads the tokens returned by Flex and implements the grammar.The simple structure holding the tree is defined in
'struct tree_t'
. The code also contains some methods to dump the tree as JSON. %{ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <errno.h> | |
#include <string.h> | |
extern int yylex(); | |
extern FILE* yyin; | |
extern void* saferealloc(void *ptr, size_t size); | |
int yywrap() { return 1;} | |
void yyerror(const char* s) {fprintf(stderr,"Error:\"%s\".\n",s);} | |
struct tree_t | |
{ | |
char* label; | |
char* length; | |
struct tree_t* child; | |
struct tree_t* next; | |
}; | |
typedef struct tree_t Tree; | |
static Tree* newTree() | |
{ | |
Tree* t=saferealloc(0,sizeof(Tree)); | |
memset(t,0,sizeof(Tree)); | |
return t; | |
} | |
static void freeTree(Tree* t) | |
{ | |
if(t==NULL) return; | |
free(t->label); | |
free(t->length); | |
freeTree(t->child); | |
freeTree(t->next); | |
free(t); | |
} | |
static void escape(FILE* out,const char* s) | |
{ | |
if(s==NULL) { fputs("null",out); return;} | |
fputc('\"',out); | |
while(*s!=0) | |
{ | |
switch(*s) | |
{ | |
case '\'': fputs("\\\'",out); break; | |
case '\"': fputs("\\\"",out); break; | |
case '\\': fputs("\\\\",out); break; | |
case '\n': fputs("\\n",out); break; | |
case '\r': fputs("\\r",out); break; | |
case '\t': fputs("\\t",out); break; | |
default : fputc(*s,out); break; | |
} | |
++s; | |
} | |
fputc('\"',out); | |
} | |
static void num(FILE* out,const char* d) | |
{ | |
if(d==NULL) { fputs("null",out);; return;} | |
fputs(d,out); | |
} | |
static void printTree(FILE* out,const Tree* t) | |
{ | |
int comma=0; | |
fputs("{",out); | |
if(t->label!=NULL) { fputs("\"label\":",out);escape(out,t->label); comma++;} | |
if(t->length!=NULL) {if(comma!=0) fputc(',',out);fputs("\"length\":",out);num(out,t->length);comma++;} | |
if(t->child!=0) | |
{ | |
const Tree* c=t->child; | |
int i=0; | |
if(comma!=0) {fputc(',',out); ++comma;} | |
fputs("\"children\":[",out); | |
while(c!=NULL) | |
{ | |
if(i!=0) fputc(',',out); | |
printTree(out,c); | |
c=c->next; | |
++i; | |
} | |
fputc(']',out); | |
} | |
fputs("}",out); | |
} | |
%} | |
%union { | |
char* s; | |
char* d; | |
struct tree_t* tree; | |
} | |
%error-verbose | |
%token OPAR | |
%token CPAR | |
%token COMMA | |
%token COLON SEMICOLON | |
%token<s> STRING | |
%token<d> NUMBER | |
%type<s> label optional_label | |
%type<d> number optional_length | |
%type<tree> subtree descendant_list_item descendant_list | |
%start input | |
%% | |
input: descendant_list optional_label optional_length SEMICOLON | |
{ | |
Tree* tree=newTree(); | |
//tree->type=ROOT; | |
tree->child=$1; | |
tree->label=$2; | |
tree->length=$3; | |
printTree(stdout,tree); | |
freeTree(tree); | |
}; | |
descendant_list: OPAR descendant_list_item CPAR | |
{ | |
$$=$2; | |
}; | |
descendant_list_item: subtree | |
{ | |
$$=$1; | |
} | |
|descendant_list_item COMMA subtree | |
{ | |
Tree* last=$1; | |
$$=$1; | |
while(last->next!=NULL) | |
{ | |
last=last->next; | |
} | |
last->next=$3; | |
} | |
; | |
subtree : descendant_list optional_label optional_length | |
{ | |
$$=newTree(); | |
$$->child=$1; | |
$$->label=$2; | |
$$->length=$3; | |
} | |
| label optional_length | |
{ | |
$$=newTree(); | |
$$->label=$1; | |
$$->length=$2; | |
} | |
; | |
optional_label: { $$=NULL;} | label { $$=$1;}; | |
optional_length: { $$=NULL;} | COLON number { $$=$2;}; | |
label: STRING { $$=$1;}; | |
number: NUMBER { $$=$1;}; | |
%% | |
int main(int argc,char** argv) | |
{ | |
yyin=stdin; | |
yyparse(); | |
return EXIT_SUCCESS; | |
} | |
Makefile
all: | |
bison -d newick.y | |
flex newick.l | |
gcc -Wall -O3 newick.tab.c lex.yy.c | |
echo "(A,B,(C,D)E)F;" | ./a.out |
Testing
compile:$ make bison -d newick.y flex newick.l gcc -Wall -O3 newick.tab.c lex.yy.c lex.yy.c:1265:17: warning: ‘yyunput’ defined but not used [-Wunused-function] lex.yy.c:1306:16: warning: ‘input’ defined but not used [-Wunused-function]test:
echo "((Human:0.3, Chimpanzee:0.2):0.1, Gorilla:0.3, (Mouse:0.6, Rat:0.5):0.2);" | ./a.out { "children": [ { "length": 0.1, "children": [ { "label": "Human", "length": 0.3 }, { "label": "Chimpanzee", "length": 0.2 } ] }, { "label": "Gorilla", "length": 0.3 }, { "length": 0.2, "children": [ { "label": "Mouse", "length": 0.6 }, { "label": "Rat", "length": 0.5 } ] } ] }
That's it,
Pierre