Parsing the Newick format in C using flex and bison.
The following post is my answer for this question on biostar "Newick 2 Json converter".
The Newick tree format is a simple format used to write out trees (using parentheses and commas) in a text file .
The original question asked for a parser based on perl but here, I've implemented a C parser using flex/bison.
Example:
((Human:0.3, Chimpanzee:0.2):0.1, Gorilla:0.3, (Mouse:0.6, Rat:0.5):0.2);
A formal grammar for the Newick format is available here
Items in { } may appear zero or more times. Items in [ ] are optional, they may appear once or not at all. All other punctuation marks (colon, semicolon, parentheses, comma and single quote) are required parts of the format. tree ==> descendant_list [ root_label ] [ : branch_length ] ; descendant_list ==> ( subtree { , subtree } ) subtree ==> descendant_list [internal_node_label] [: branch_length] ==> leaf_label [: branch_length] root_label ==> label internal_node_label ==> label leaf_label ==> label label ==> unquoted_label ==> quoted_label unquoted_label ==> string_of_printing_characters quoted_label ==> ' string_of_printing_characters ' branch_length ==> signed_number ==> unsigned_number
The Flex Lexer
The Flex Lexer is used to extract the terminal tokens of the grammar from the input stream.Those terminals are '(' ')' ',' ';' ':' , strings and numbers. For the simple and double quoted strings, we tell the lexer to enter in a specific state ( 'apos' and 'quot').
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%{ | |
#include "newick.tab.h" | |
static size_t string_length=0; | |
void* saferealloc(void *ptr, size_t size) | |
{ | |
char* p= realloc(ptr,sizeof(char)*(size)); | |
if(p==NULL) | |
{ | |
fprintf(stderr,"out of memory"); | |
exit(EXIT_FAILURE); | |
} | |
return p; | |
} | |
static char* copy(const char* s,size_t length) | |
{ | |
char* p= saferealloc(NULL,sizeof(char)*(length+1)); | |
strncpy(p,s,length); | |
p[length]=0; | |
return p; | |
} | |
static char* append(size_t* len,const char* s2,size_t len2) | |
{ | |
yylval.s= saferealloc( yylval.s,sizeof(char)*(*len+len2+1)); | |
strncpy(&yylval.s[*len],s2,len2); | |
yylval.s[*len+len2]=0; | |
*len+=len2; | |
return yylval.s; | |
} | |
%} | |
%s apos | |
%s quot | |
%% | |
<quot>{ | |
\\\" append(&string_length,"\"",1); | |
\' append(&string_length,"\'",1); | |
\" {BEGIN(INITIAL);return STRING;} | |
} | |
<apos>{ | |
\\\' append(&string_length,"\'",1); | |
\" append(&string_length,"\"",1); | |
\' {BEGIN(INITIAL);return STRING;} | |
} | |
<apos,quot>{ | |
\\n append(&string_length,"\n",1); | |
\\t append(&string_length,"\t",1); | |
\\\\ append(&string_length,"\\",1); | |
([^\"\'\\]|\n)+ append(&string_length,yytext,yyleng); | |
} | |
\: return COLON; | |
\; return SEMICOLON; | |
\) return CPAR; | |
\( return OPAR; | |
\, return COMMA; | |
\" {string_length=0;BEGIN(quot); } | |
\' {string_length=0;BEGIN(apos); } | |
[a-zA-Z_][a-zA-Z_0-9]* {yylval.s=copy(yytext,yyleng); return STRING;} | |
[\+|\-]?[0-9]+ {yylval.d=copy(yytext,yyleng); return NUMBER;} | |
[\+|\-]?[0-9]+\.[0-9]+([e|E][0-9]+)? {yylval.d=copy(yytext,yyleng); return NUMBER;} | |
[ \t\n\r] ; | |
. {fprintf(stderr,"Syntax error (%c)\n",yytext[0]);exit(EXIT_FAILURE);} | |
%% |
The Bison Scanner
The Bison scanner reads the tokens returned by Flex and implements the grammar.The simple structure holding the tree is defined in
'struct tree_t'
. The code also contains some methods to dump the tree as JSON.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%{ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <errno.h> | |
#include <string.h> | |
extern int yylex(); | |
extern FILE* yyin; | |
extern void* saferealloc(void *ptr, size_t size); | |
int yywrap() { return 1;} | |
void yyerror(const char* s) {fprintf(stderr,"Error:\"%s\".\n",s);} | |
struct tree_t | |
{ | |
char* label; | |
char* length; | |
struct tree_t* child; | |
struct tree_t* next; | |
}; | |
typedef struct tree_t Tree; | |
static Tree* newTree() | |
{ | |
Tree* t=saferealloc(0,sizeof(Tree)); | |
memset(t,0,sizeof(Tree)); | |
return t; | |
} | |
static void freeTree(Tree* t) | |
{ | |
if(t==NULL) return; | |
free(t->label); | |
free(t->length); | |
freeTree(t->child); | |
freeTree(t->next); | |
free(t); | |
} | |
static void escape(FILE* out,const char* s) | |
{ | |
if(s==NULL) { fputs("null",out); return;} | |
fputc('\"',out); | |
while(*s!=0) | |
{ | |
switch(*s) | |
{ | |
case '\'': fputs("\\\'",out); break; | |
case '\"': fputs("\\\"",out); break; | |
case '\\': fputs("\\\\",out); break; | |
case '\n': fputs("\\n",out); break; | |
case '\r': fputs("\\r",out); break; | |
case '\t': fputs("\\t",out); break; | |
default : fputc(*s,out); break; | |
} | |
++s; | |
} | |
fputc('\"',out); | |
} | |
static void num(FILE* out,const char* d) | |
{ | |
if(d==NULL) { fputs("null",out);; return;} | |
fputs(d,out); | |
} | |
static void printTree(FILE* out,const Tree* t) | |
{ | |
int comma=0; | |
fputs("{",out); | |
if(t->label!=NULL) { fputs("\"label\":",out);escape(out,t->label); comma++;} | |
if(t->length!=NULL) {if(comma!=0) fputc(',',out);fputs("\"length\":",out);num(out,t->length);comma++;} | |
if(t->child!=0) | |
{ | |
const Tree* c=t->child; | |
int i=0; | |
if(comma!=0) {fputc(',',out); ++comma;} | |
fputs("\"children\":[",out); | |
while(c!=NULL) | |
{ | |
if(i!=0) fputc(',',out); | |
printTree(out,c); | |
c=c->next; | |
++i; | |
} | |
fputc(']',out); | |
} | |
fputs("}",out); | |
} | |
%} | |
%union { | |
char* s; | |
char* d; | |
struct tree_t* tree; | |
} | |
%error-verbose | |
%token OPAR | |
%token CPAR | |
%token COMMA | |
%token COLON SEMICOLON | |
%token<s> STRING | |
%token<d> NUMBER | |
%type<s> label optional_label | |
%type<d> number optional_length | |
%type<tree> subtree descendant_list_item descendant_list | |
%start input | |
%% | |
input: descendant_list optional_label optional_length SEMICOLON | |
{ | |
Tree* tree=newTree(); | |
//tree->type=ROOT; | |
tree->child=$1; | |
tree->label=$2; | |
tree->length=$3; | |
printTree(stdout,tree); | |
freeTree(tree); | |
}; | |
descendant_list: OPAR descendant_list_item CPAR | |
{ | |
$$=$2; | |
}; | |
descendant_list_item: subtree | |
{ | |
$$=$1; | |
} | |
|descendant_list_item COMMA subtree | |
{ | |
Tree* last=$1; | |
$$=$1; | |
while(last->next!=NULL) | |
{ | |
last=last->next; | |
} | |
last->next=$3; | |
} | |
; | |
subtree : descendant_list optional_label optional_length | |
{ | |
$$=newTree(); | |
$$->child=$1; | |
$$->label=$2; | |
$$->length=$3; | |
} | |
| label optional_length | |
{ | |
$$=newTree(); | |
$$->label=$1; | |
$$->length=$2; | |
} | |
; | |
optional_label: { $$=NULL;} | label { $$=$1;}; | |
optional_length: { $$=NULL;} | COLON number { $$=$2;}; | |
label: STRING { $$=$1;}; | |
number: NUMBER { $$=$1;}; | |
%% | |
int main(int argc,char** argv) | |
{ | |
yyin=stdin; | |
yyparse(); | |
return EXIT_SUCCESS; | |
} | |
Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all: | |
bison -d newick.y | |
flex newick.l | |
gcc -Wall -O3 newick.tab.c lex.yy.c | |
echo "(A,B,(C,D)E)F;" | ./a.out |
Testing
compile:$ make bison -d newick.y flex newick.l gcc -Wall -O3 newick.tab.c lex.yy.c lex.yy.c:1265:17: warning: ‘yyunput’ defined but not used [-Wunused-function] lex.yy.c:1306:16: warning: ‘input’ defined but not used [-Wunused-function]test:
echo "((Human:0.3, Chimpanzee:0.2):0.1, Gorilla:0.3, (Mouse:0.6, Rat:0.5):0.2);" | ./a.out { "children": [ { "length": 0.1, "children": [ { "label": "Human", "length": 0.3 }, { "label": "Chimpanzee", "length": 0.2 } ] }, { "label": "Gorilla", "length": 0.3 }, { "length": 0.2, "children": [ { "label": "Mouse", "length": 0.6 }, { "label": "Rat", "length": 0.5 } ] } ] }
That's it,
Pierre
4 comments:
This is very fun to read, and it brings back memories. I used to work a fair amount with lex and yacc. I didn't try out your code, but I have no doubt that it works.
I have a quick question. If I were to try to code something like this, I don't think I would go back to these tools; I think I'd either do it in Perl, or I'd use it as an excuse to try out antlr (have you heard of it?). So the question is, why flex and bison?
Chris, I wrote it in lex/bison because it is just fun dans my code was just a roughly copy-paste of my C code for parsing JSON :-)
About another language: ( I'm not a perl programmer ) I would use JAVACC rather than antlr because javacc doesn't require any dependencies.
Hello, I am an Italian student and a university project I should do a tree with flex and bison , as what you did in the post . Very good !. I tried your code but i get error :
missing "newick.tab.h".
It was a file to create bison?
I'm sorry for my english.
@giovanna show me the full error trace or look at this git repo: https://github.com/lindenb/newicktools
it contains the recent sources.
Post a Comment