Parsing a genetic code using flex and bison. Part 2/2
(This post was inspired by this tutorial: "Writing a Reentrant Parser with Flex and Bison".)
In the previous post I showed how to parse an NCBI genetic code in ASN.1 using flex and bison. However, the generated code is not reentrant: it relies on global variables, which prevents it from being used in a concurrent environment.
If we want to use this code in a multithreaded environment, some changes are needed. First, we need a new structure, GCState, in which the state of the lexer is saved between successive requests from bison for a new token:
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}
/* Semantic value attached to tokens and non-terminals:
   s       - text of a TEXT token
   integer - value of an INTEGER token
   gCode   - a GeneticCode node (see gc.h) built by 'code' / 'codes' */
%union {
char* s;
int integer;
GeneticCode gCode;
}
/* ask bison for a reentrant (pure) parser */
%pure-parser
/* generated symbols are prefixed with gc_ (the parser entry point is gc_parse) */
%name-prefix="gc_"
/* NOTE(review): presumably what makes bison write gc.tab.h, which gc.l includes - confirm */
%defines
/* NOTE(review): presumably requests more detailed syntax error messages - confirm */
%error-verbose
/* the parser function receives the caller's GCState */
%parse-param{GCState state}
/* extra argument handed to yylex; the name is mapped onto the GCState by a #define in the prologue */
%lex-param{void* handle_to_the_scanner}
/* tokens without a semantic value */
%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
/* tokens carrying a value from the %union */
%token<integer> INTEGER
%token<s> TEXT
/* non-terminals carrying a value from the %union */
%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input
%{
/* The scanner is generated by flex from gc.l. Declare it here so that the
   call emitted by bison does not rely on an implicit function declaration.
   The second argument is the flex scanner handle. */
int yylex(YYSTYPE* yylval_param,void* yyscanner);

/**
 * Error callback invoked by the generated parser.
 * @param state   parser state given to the parser through %parse-param (unused here)
 * @param message description of the error, printed on stderr
 */
void yyerror(GCState state,const char* message)
	{
	(void)state;
	fprintf(stderr,"ERROR:%s\n",message);
	}
/* %lex-param names 'handle_to_the_scanner'; inside the parser that name is
   resolved to the field stored in the GCState */
#define handle_to_the_scanner (state->handle_to_the_scanner)
%}
%%
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
	{
	/* Echo the parsed table on stdout, then release it: once this action
	   has run nothing else references the list. */
	GeneticCode gc=$4;
	fputs("Genetic-code-table ::= {",stdout);
	while(gc!=NULL)
		{
		/* remember the successor before the node is freed */
		GeneticCode next=gc->next;
		printf("{name \"%s\",", gc->name1);
		if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
		printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
			gc->id, gc->ncbieaa, gc->sncbieaa
			);
		if(next!=NULL) fputc(',',stdout);
		/* the strings come from the TEXT tokens, the node from the 'code' rule;
		   name2 may be NULL, which free() accepts */
		free(gc->name1);
		free(gc->name2);
		free(gc->ncbieaa);
		free(gc->sncbieaa);
		free(gc);
		gc=next;
		}
	fputc('}',stdout);
	};
codes: code
	{
	/* a single entry is the head of the list */
	$$ = $1;
	}
	| codes COMMA code
	{
	/* the newest entry becomes the head and points at the entries read so far,
	   so the list ends up in reverse input order */
	GeneticCode head = $3;
	head->next = $1;
	$$ = head;
	}
	;
code: OPEN
	NAME TEXT COMMA
	optional_name
	ID INTEGER COMMA
	NCBIEAA TEXT COMMA
	SNCBIEAA TEXT
	CLOSE
	{
	/* Build one list node from the fields of a { ... } entry.
	   The node takes ownership of the strings carried by the TEXT tokens. */
	$$ = malloc(sizeof(GeneticCodeStruct));
	if($$==NULL) exit(EXIT_FAILURE);
	$$->name1=$3;
	$$->name2=$5;/* NULL when the optional second name is absent */
	$$->id=$7;
	$$->ncbieaa=$10;
	$$->sncbieaa=$13;
	/* malloc() does not clear memory: terminate the list explicitly, otherwise
	   the loop in 'input' follows an indeterminate pointer after the last node */
	$$->next=NULL;
	}
	;
optional_name:
	/* empty: the entry has no second name */
	{
	$$ = NULL;
	}
	| NAME TEXT COMMA
	{
	/* hand the string of the TEXT token over to the caller */
	$$ = $2;
	}
	;
%%
/* gcParse() is defined in the lexer file gc.l */
extern void gcParse(FILE* in);
/* Entry point: parses the genetic code table read from standard input.
   The command-line arguments are not used; always returns 0. */
int main(int argc,char** argv)
{
gcParse(stdin);
return 0;
} and the file gc.h
That's it.
Pierre
In the previous post I showed how to parse an NCBI genetic code in ASN.1 using flex and bison. However, the generated code is not reentrant: it relies on global variables, which prevents it from being used in a concurrent environment.
If we want to use this code in a multithreaded environment, some changes are needed. First, we need a new structure, GCState, in which the state of the lexer is saved between successive requests from bison for a new token:
/** State shared between the bison parser and the flex lexer. */
struct gcState
	{
	/** reference to the flex lexer */
	void *handle_to_the_scanner;
	};
typedef struct gcState GCStateStruct;
typedef struct gcState *GCState;
{
/** reference to the flex lexer */
void* handle_to_the_scanner;
}GCStateStruct,*GCState;
The Lexer
In the lexer, some options must be added: we tell flex to create a reentrant scanner, and that the yylex function generated by flex takes an extra argument, yylval: %option bison-bridge
%option reentrant
Before it can be used, the scanner context 'yyscan_t' has to be initialized:%option reentrant
/**
 * Parses a genetic code table read from 'in'.
 * Creates a private flex scanner, runs the bison parser on it and destroys
 * the scanner afterwards. Returns silently if the scanner cannot be created.
 * @param in the input stream
 */
void gcParse(FILE* in)
	{
	/* generated by bison (%name-prefix="gc_", %parse-param{GCState state}) */
	int gc_parse(GCState state);
	GCStateStruct ctx;
	yyscan_t scanner;//structure created by flex
	memset(&ctx,0,sizeof(GCStateStruct));
	if(yylex_init_extra(&ctx,&scanner)!=0)//initialize the scanner
		{
		return;
		}
	/* yyscan_t is already the opaque handle expected by yylex: store the
	   handle itself, not the address of the local variable holding it */
	ctx.handle_to_the_scanner=scanner;//will tell bison where is the lexer
	yyset_in(in,scanner);//change the input stream
	/* the parser takes a GCState, i.e. a pointer to the state */
	gc_parse(&ctx);
	yylex_destroy(scanner);
	}
Then, for each action in yylex, a reference to the state held by the scanner is added:{
GCStateStruct ctx;
memset(&ctx,0,sizeof(GCStateStruct));
yyscan_t scanner;//structure created by flex
if(yylex_init_extra(&ctx,&scanner)!=0)//initialize the scanner
{
return;
}
ctx.handle_to_the_scanner=&scanner;//will tell bison where is the lexer
yyset_in (in,scanner );//change the input stream
gc_parse(ctx);//generated by bison
yylex_destroy(scanner);
}
[0-9]+ {
	/* convert the matched digits and hand the value to bison */
	const char* digits = yyget_text(yyscanner);
	yyget_lval(yyscanner)->integer = atoi(digits);
	return INTEGER;
	}
\"[^\"]+\" {
	/* The match includes both quotes: the payload is leng-2 bytes long and
	   needs one more byte for the terminator. */
	size_t len = (size_t)yyget_leng(yyscanner) - 2;
	char* copy = malloc(len + 1);
	if(copy==NULL) exit(EXIT_FAILURE);
	memcpy(copy, yyget_text(yyscanner) + 1, len);
	/* the original strncpy() left the string unterminated */
	copy[len] = '\0';
	yyget_lval(yyscanner)->s = copy;
	return TEXT;
}
\"[^\"]+\" {
	/* The match includes both quotes: the payload is leng-2 bytes long and
	   needs one more byte for the terminator. */
	size_t len = (size_t)yyget_leng(yyscanner) - 2;
	char* copy = malloc(len + 1);
	if(copy==NULL) exit(EXIT_FAILURE);
	memcpy(copy, yyget_text(yyscanner) + 1, len);
	/* the original strncpy() left the string unterminated */
	copy[len] = '\0';
	yyget_lval(yyscanner)->s = copy;
	return TEXT;
}
The Parser
We tell bison that we want a reentrant parser:%pure-parser
. The parser function takes an extra GCState parameter, so that the state is available each time the lexer is asked for a new token:%parse-param{GCState state}
. We also tell bison that the yylex function generated by flex takes an extra parameter holding the state of the scanner:%lex-param{void* handle_to_the_scanner}
. A #define
is added to tell bison how to extract the handle to the scanner from the GCState:#define handle_to_the_scanner (state->handle_to_the_scanner)
All in one
Here is the lexer gc.l:%option prefix="gc_"
%option noyywrap
%option bison-bridge
%option reentrant
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}
%%
\-\-.*\n ;/* ignore comments */
Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {yyget_lval(yyscanner)->integer=atoi(yyget_text(yyscanner)) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
	/* The match includes both quotes: the payload is leng-2 bytes long and
	   needs one more byte for the terminator. */
	size_t len = (size_t)yyget_leng(yyscanner) - 2;
	char* copy = malloc(len + 1);
	if(copy==NULL) exit(EXIT_FAILURE);
	memcpy(copy, yyget_text(yyscanner) + 1, len);
	/* the original strncpy() left the string unterminated */
	copy[len] = '\0';
	yyget_lval(yyscanner)->s = copy;
	return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];
%%
/**
 * Parses a genetic code table read from 'in'.
 * Creates a private flex scanner with debugging switched on, runs the bison
 * parser on it and destroys the scanner afterwards. Returns silently if the
 * scanner cannot be created.
 * @param in the input stream
 */
void gcParse(FILE* in)
	{
	/* generated by bison (%name-prefix="gc_", %parse-param{GCState state}) */
	int gc_parse(GCState state);
	GCStateStruct ctx;
	yyscan_t scanner;
	memset(&ctx,0,sizeof(GCStateStruct));
	if(yylex_init_extra(&ctx,&scanner)!=0)
		{
		return;
		}
	/* yyscan_t is already the opaque handle expected by yylex: store the
	   handle itself, not the address of the local variable holding it */
	ctx.handle_to_the_scanner=scanner;
	yyset_in(in,scanner);
	yyset_debug(1,scanner);
	/* the parser takes a GCState, i.e. a pointer to the state */
	gc_parse(&ctx);
	yylex_destroy(scanner);
	}
Here is the parser gc.y:%option noyywrap
%option bison-bridge
%option reentrant
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}
%%
\-\-.*\n ;/* ignore comments */
Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {yyget_lval(yyscanner)->integer=atoi(yyget_text(yyscanner)) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
	/* The match includes both quotes: the payload is leng-2 bytes long and
	   needs one more byte for the terminator. */
	size_t len = (size_t)yyget_leng(yyscanner) - 2;
	char* copy = malloc(len + 1);
	if(copy==NULL) exit(EXIT_FAILURE);
	memcpy(copy, yyget_text(yyscanner) + 1, len);
	/* the original strncpy() left the string unterminated */
	copy[len] = '\0';
	yyget_lval(yyscanner)->s = copy;
	return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];
%%
/**
 * Parses a genetic code table read from 'in'.
 * Creates a private flex scanner with debugging switched on, runs the bison
 * parser on it and destroys the scanner afterwards. Returns silently if the
 * scanner cannot be created.
 * @param in the input stream
 */
void gcParse(FILE* in)
	{
	/* generated by bison (%name-prefix="gc_", %parse-param{GCState state}) */
	int gc_parse(GCState state);
	GCStateStruct ctx;
	yyscan_t scanner;
	memset(&ctx,0,sizeof(GCStateStruct));
	if(yylex_init_extra(&ctx,&scanner)!=0)
		{
		return;
		}
	/* yyscan_t is already the opaque handle expected by yylex: store the
	   handle itself, not the address of the local variable holding it */
	ctx.handle_to_the_scanner=scanner;
	yyset_in(in,scanner);
	yyset_debug(1,scanner);
	/* the parser takes a GCState, i.e. a pointer to the state */
	gc_parse(&ctx);
	yylex_destroy(scanner);
	}
%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}
/* Semantic value attached to tokens and non-terminals:
   s       - text of a TEXT token
   integer - value of an INTEGER token
   gCode   - a GeneticCode node (see gc.h) built by 'code' / 'codes' */
%union {
char* s;
int integer;
GeneticCode gCode;
}
/* ask bison for a reentrant (pure) parser */
%pure-parser
/* generated symbols are prefixed with gc_ (the parser entry point is gc_parse) */
%name-prefix="gc_"
/* NOTE(review): presumably what makes bison write gc.tab.h, which gc.l includes - confirm */
%defines
/* NOTE(review): presumably requests more detailed syntax error messages - confirm */
%error-verbose
/* the parser function receives the caller's GCState */
%parse-param{GCState state}
/* extra argument handed to yylex; the name is mapped onto the GCState by a #define in the prologue */
%lex-param{void* handle_to_the_scanner}
/* tokens without a semantic value */
%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
/* tokens carrying a value from the %union */
%token<integer> INTEGER
%token<s> TEXT
/* non-terminals carrying a value from the %union */
%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input
%{
/* The scanner is generated by flex from gc.l. Declare it here so that the
   call emitted by bison does not rely on an implicit function declaration.
   The second argument is the flex scanner handle. */
int yylex(YYSTYPE* yylval_param,void* yyscanner);

/**
 * Error callback invoked by the generated parser.
 * @param state   parser state given to the parser through %parse-param (unused here)
 * @param message description of the error, printed on stderr
 */
void yyerror(GCState state,const char* message)
	{
	(void)state;
	fprintf(stderr,"ERROR:%s\n",message);
	}
/* %lex-param names 'handle_to_the_scanner'; inside the parser that name is
   resolved to the field stored in the GCState */
#define handle_to_the_scanner (state->handle_to_the_scanner)
%}
%%
input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
	{
	/* Echo the parsed table on stdout, then release it: once this action
	   has run nothing else references the list. */
	GeneticCode gc=$4;
	fputs("Genetic-code-table ::= {",stdout);
	while(gc!=NULL)
		{
		/* remember the successor before the node is freed */
		GeneticCode next=gc->next;
		printf("{name \"%s\",", gc->name1);
		if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
		printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
			gc->id, gc->ncbieaa, gc->sncbieaa
			);
		if(next!=NULL) fputc(',',stdout);
		/* the strings come from the TEXT tokens, the node from the 'code' rule;
		   name2 may be NULL, which free() accepts */
		free(gc->name1);
		free(gc->name2);
		free(gc->ncbieaa);
		free(gc->sncbieaa);
		free(gc);
		gc=next;
		}
	fputc('}',stdout);
	};
codes: code
	{
	/* a single entry is the head of the list */
	$$ = $1;
	}
	| codes COMMA code
	{
	/* the newest entry becomes the head and points at the entries read so far,
	   so the list ends up in reverse input order */
	GeneticCode head = $3;
	head->next = $1;
	$$ = head;
	}
	;
code: OPEN
	NAME TEXT COMMA
	optional_name
	ID INTEGER COMMA
	NCBIEAA TEXT COMMA
	SNCBIEAA TEXT
	CLOSE
	{
	/* Build one list node from the fields of a { ... } entry.
	   The node takes ownership of the strings carried by the TEXT tokens. */
	$$ = malloc(sizeof(GeneticCodeStruct));
	if($$==NULL) exit(EXIT_FAILURE);
	$$->name1=$3;
	$$->name2=$5;/* NULL when the optional second name is absent */
	$$->id=$7;
	$$->ncbieaa=$10;
	$$->sncbieaa=$13;
	/* malloc() does not clear memory: terminate the list explicitly, otherwise
	   the loop in 'input' follows an indeterminate pointer after the last node */
	$$->next=NULL;
	}
	;
optional_name:
	/* empty: the entry has no second name */
	{
	$$ = NULL;
	}
	| NAME TEXT COMMA
	{
	/* hand the string of the TEXT token over to the caller */
	$$ = $2;
	}
	;
%%
/* gcParse() is defined in the lexer file gc.l */
extern void gcParse(FILE* in);

/**
 * Entry point: parses the genetic code table read from standard input.
 * The command-line arguments are not used; always returns 0.
 */
int main(int argc,char** argv)
	{
	(void)argc;
	(void)argv;
	gcParse(stdin);
	return 0;
	}
#ifndef GENETIC_CODE_H
#define GENETIC_CODE_H
/** One genetic code entry; entries are chained into a singly linked list. */
struct geneticCode
	{
	char *name1;              /* first name of the entry */
	char *name2;              /* second name, NULL when the entry has none */
	int id;                   /* value of the 'id' field */
	char *ncbieaa;            /* value of the 'ncbieaa' field */
	char *sncbieaa;           /* value of the 'sncbieaa' field */
	struct geneticCode *next; /* following entry in the list */
	};
typedef struct geneticCode GeneticCodeStruct;
typedef struct geneticCode *GeneticCode;
/** State shared between the bison parser and the flex lexer. */
struct gcState
	{
	/** reference to the flex lexer */
	void *handle_to_the_scanner;
	};
typedef struct gcState GCStateStruct;
typedef struct gcState *GCState;
#endif
Compiling...#define GENETIC_CODE_H
/** One genetic code entry; entries are chained into a singly linked list. */
struct geneticCode
	{
	char *name1;              /* first name of the entry */
	char *name2;              /* second name, may be NULL */
	int id;                   /* value of the 'id' field */
	char *ncbieaa;            /* value of the 'ncbieaa' field */
	char *sncbieaa;           /* value of the 'sncbieaa' field */
	struct geneticCode *next; /* following entry in the list */
	};
typedef struct geneticCode GeneticCodeStruct;
typedef struct geneticCode *GeneticCode;
/** State shared between the bison parser and the flex lexer. */
struct gcState
	{
	/** reference to the flex lexer */
	void *handle_to_the_scanner;
	};
typedef struct gcState GCStateStruct;
typedef struct gcState *GCState;
#endif
flex gc.l
bison gc.y
gcc gc.tab.c lex.gc_.c
Testing:bison gc.y
gcc gc.tab.c lex.gc_.c
cat gc.ptr |./a.out | ./a.out |./a.out |./a.out
Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}
Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}
That's it.
Pierre
No comments:
Post a Comment