Parsing a genetic code using flex and bison. Part 1/2
In this post I'll show how to write a lexer and a parser using Flex and Bison for parsing the genetic codes at NCBI. I'll write a non-reentrant parser that is to say that the generated code contains some global variables and cannot be safely executed concurrently. The input file is an ASN1 file that looks like this:
file gc.h:
%type<gCode> code
(...)
code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
An optional name is 'nothing' or the following tokens: 'NAME TEXT COMMA'
A series of genetic codes is one genetic code, or a series of genetic codes followed by one genetic code:
All in one here is the flex file: gc.l
That's it.
Pierre
In the next post I'll write a reentrant parser using flex and bison.
--*************************************************************************
-- This is the NCBI genetic code table
-- (...)
--**************************************************************************
Genetic-code-table ::= {
{
name "Standard" ,
name "SGC0" ,
id 1 ,
ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "---M---------------M---------------M----------------------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
},
{
(...)
} ,
{
name "Thraustochytrium Mitochondrial" ,
id 23 ,
ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "--------------------------------M--M---------------M------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
}
}
Each genetic code will be stored as a linked list of C structures called GeneticCode:-- This is the NCBI genetic code table
-- (...)
--**************************************************************************
Genetic-code-table ::= {
{
name "Standard" ,
name "SGC0" ,
id 1 ,
ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "---M---------------M---------------M----------------------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
},
{
(...)
} ,
{
name "Thraustochytrium Mitochondrial" ,
id 23 ,
ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
sncbieaa "--------------------------------M--M---------------M------------"
-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
}
}
file gc.h:
#ifndef GENETIC_CODE_H
#define GENETIC_CODE_H
typedef struct geneticCode
{
char* name1;
char* name2;
int id;
char* ncbieaa;
char* sncbieaa;
struct geneticCode *next;
}GeneticCodeStruct,*GeneticCode;
#endif
.The LEXER generated by flex converts the input into a sequence of tokens, and the parser generated by bison converts a grammar description into a C program.#define GENETIC_CODE_H
typedef struct geneticCode
{
char* name1;
char* name2;
int id;
char* ncbieaa;
char* sncbieaa;
struct geneticCode *next;
}GeneticCodeStruct,*GeneticCode;
#endif
The Parser
The grammar of Bison says that the lexical context might carry a string (the code ID) , a string (ncbieaa,ncbieaa...) or a GeneticCode:%union {
char* s;
int integer;
GeneticCode gCode;
}
(...)
%token<integer> INTEGER
%token<s> TEXT
A genetic code is a series of tokens consisting in one or two names, the id, the ncbiaa and the sncbiaa strings. Every time this context is found, a new GeneticCode is allocated and its fields are filled:char* s;
int integer;
GeneticCode gCode;
}
(...)
%token<integer> INTEGER
%token<s> TEXT
%type<gCode> code
(...)
code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
optional_name:/* nothing */
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}
A series of genetic codes is one genetic code, or a series of genetic codes followed by one genetic code:
codes: code
{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;
At last, the input constists in the tokens 'GENETIC_CODE_TABLE ASSIGN OPEN' and 'CLOSE' , surrounding a series of genetic codes. Each time, an input has been parsed, the linked list is scanned so each genetic code is printed to stdout.{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};
The Lexer
The lexers breaks the input into tokens. In this lexer, comments are ignored:\-\-.*\n ;/* ignore comments */
... keywords are returned Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
... and the tokens carrying a context (integers, strings...) are decoded:name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
[0-9]+ {gc_lval.integer=atoi(yytext) ;return INTEGER;}
\"[^\"]+\" {
//copy the string without the quotes
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}
\"[^\"]+\" {
//copy the string without the quotes
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}
All in one here is the flex file: gc.l
%option prefix="gc_"
%option noyywrap
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}
%%
\-\-.*\n ;/* ignore comments */
Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {gc_lval.integer=atoi(yytext) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];
%%
... the bison file gc.y:%option noyywrap
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}
%%
\-\-.*\n ;/* ignore comments */
Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {gc_lval.integer=atoi(yytext) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
gc_lval.s= malloc(yyleng-1);
if(gc_lval.s==NULL) exit(EXIT_FAILURE);
strncpy(gc_lval.s,&yytext[1],yyleng-2);
return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];
%%
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}
%union {
char* s;
int integer;
GeneticCode gCode;
}
%name-prefix="gc_"
%defines
%error-verbose
%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
%token<integer> INTEGER
%token<s> TEXT
%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input
%{
void yyerror(const char* message)
{
fprintf(stderr,"ERROR:%s\n",message);
}
%}
%%
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};
codes: code
{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;
code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
optional_name:/* nothing */
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}
%%
int main(int argc,char** argv)
{
yyparse();
return 0;
}
Compiling:#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}
%union {
char* s;
int integer;
GeneticCode gCode;
}
%name-prefix="gc_"
%defines
%error-verbose
%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
%token<integer> INTEGER
%token<s> TEXT
%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input
%{
void yyerror(const char* message)
{
fprintf(stderr,"ERROR:%s\n",message);
}
%}
%%
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
GeneticCode gc=$4;
fputs("Genetic-code-table ::= {",stdout);
while(gc!=NULL)
{
printf("{name \"%s\",", gc->name1);
if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
gc->id, gc->ncbieaa, gc->sncbieaa
);
if(gc->next!=NULL) fputc(',',stdout);
gc=gc->next;
}
fputc('}',stdout);
/** free memory here.... */
};
codes: code
{
$$=$1;
}
| codes COMMA code
{
$$=$3;
$$->next=$1;
}
;
code: OPEN
NAME TEXT COMMA
optional_name
ID INTEGER COMMA
NCBIEAA TEXT COMMA
SNCBIEAA TEXT
CLOSE
{
$$ = malloc(sizeof(GeneticCodeStruct));
if($$==NULL) exit(EXIT_FAILURE);
$$->name1=$3;
$$->name2=$5;
$$->id=$7;
$$->ncbieaa=$10;
$$->sncbieaa=$13;
}
;
optional_name:/* nothing */
{
$$=NULL;
}
| NAME TEXT COMMA
{
$$=$2;
}
%%
int main(int argc,char** argv)
{
yyparse();
return 0;
}
flex gc.l
bison gc.y
gcc -o a.out gc.tab.c lex.gc_.c
Testing:bison gc.y
gcc -o a.out gc.tab.c lex.gc_.c
cat gc.ptr |./a.out | ./a.out |./a.out |./a.out
Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}
Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}
That's it.
Pierre
In the next post I'll write a reentrant parser using flex and bison.
No comments:
Post a Comment