14 December 2009

Parsing a genetic code using flex and bison. Part 2/2

(this post was inspired by this tutorial: "Writing a Reentrant Parser with Flex and Bison").
In the previous post I showed how to parse an NCBI genetic code in ASN.1 using flex and bison. However, the generated code is not reentrant; that is to say, some global variables prevent it from being used in a concurrent environment.

If we want to use this code in a multithreaded environment, some changes must be made: first we need a new structure, GCState, where the state of the lexer is saved between the successive times bison requests a new token:
/** State carried through the parser so that it can reach the scanner. */
struct gcState
{
    void *handle_to_the_scanner; /**< reference to the flex lexer */
};
/** The state structure itself. */
typedef struct gcState GCStateStruct;
/** Pointer to the state structure. */
typedef struct gcState *GCState;
Your browser does not support the <CANVAS> element !

The Lexer

In the lexer some options must be added: we tell Flex to create a reentrant scanner and that the yylex function it generates takes an extra argument, yylval:
%option bison-bridge
%option reentrant
Before it can be used, the scanner context 'yyscan_t' has to be initialized:
/**
 * Parse a genetic code table read from `in`.
 *
 * Creates a reentrant flex scanner, records it in a GCStateStruct and runs
 * the bison-generated parser on that state. Returns silently if the scanner
 * cannot be initialized.
 */
void gcParse(FILE* in)
{
    GCStateStruct ctx;
    yyscan_t scanner;//structure created by flex

    memset(&ctx,0,sizeof(GCStateStruct));
    if(yylex_init_extra(&ctx,&scanner)!=0)//initialize the scanner
    {
        return;
    }
    /* yylex expects the yyscan_t itself, not the address of the local
       variable that holds it */
    ctx.handle_to_the_scanner=scanner;//will tell bison where is the lexer
    yyset_in (in,scanner );//change the input stream

    /* %parse-param declares a GCState, i.e. a pointer to the state */
    gc_parse(&ctx);//generated by bison
    yylex_destroy(scanner);
}
Then, for each action in yylex, a reference to the scanner state (yyscanner) is added:
[0-9]+ {yyget_lval(yyscanner)->integer=atoi(yyget_text(yyscanner)) ;return INTEGER;}
\"[^\"]+\" {
    /* The match includes both quotes: the payload is yyleng-2 characters
       and one more byte is needed for the terminating NUL. */
    size_t len = (size_t)yyget_leng(yyscanner) - 2;
    char* copy = malloc(len + 1);
    if(copy==NULL) exit(EXIT_FAILURE);
    memcpy(copy, yyget_text(yyscanner) + 1, len);
    copy[len] = '\0'; /* strncpy alone left the string unterminated */
    yyget_lval(yyscanner)->s = copy;
    return TEXT;
}

The Parser

We tell bison that we want a reentrant parser:
%pure-parser
We want the generated parser function (and yyerror) to receive an extra GCState parameter, so that the state is at hand each time the lexer is asked for a new token:
%parse-param{GCState state}
We also tell bison that the yylex function generated by flex takes an extra parameter holding the state of the scanner:
%lex-param{void* handle_to_the_scanner}
A #define is added to tell bison how to extract the handle to the scanner from the GCState:
#define handle_to_the_scanner (state->handle_to_the_scanner)

All in one

Here is the lexer gc.l:
%option prefix="gc_"
%option noyywrap
%option bison-bridge
%option reentrant

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "gc.h"
#include "gc.tab.h" //generated by bison
%}

%%
\-\-.*\n ;/* ignore comments */

Genetic\-code\-table return GENETIC_CODE_TABLE;
name return NAME;
id return ID;
ncbieaa return NCBIEAA;
sncbieaa return SNCBIEAA;
[0-9]+ {yyget_lval(yyscanner)->integer=atoi(yyget_text(yyscanner)) ;return INTEGER;}
\:\:= return ASSIGN;
, return COMMA;
\{ return OPEN;
\} return CLOSE;
\"[^\"]+\" {
    /* The match includes both quotes: the payload is yyleng-2 characters
       and one more byte is needed for the terminating NUL. The parser
       takes ownership of the allocated copy. */
    size_t len = (size_t)yyget_leng(yyscanner) - 2;
    char* copy = malloc(len + 1);
    if(copy==NULL) exit(EXIT_FAILURE);
    memcpy(copy, yyget_text(yyscanner) + 1, len);
    copy[len] = '\0'; /* strncpy alone left the string unterminated */
    yyget_lval(yyscanner)->s = copy;
    return TEXT;
}
[\n\t ] ;//ignore blanks
. return yytext[0];

%%

void gcParse(FILE* in)
{
GCStateStruct ctx;
memset(&ctx,0,sizeof(GCStateStruct));
yyscan_t scanner;

if(yylex_init_extra(&ctx,&scanner)!=0)
{
return;
}
ctx.handle_to_the_scanner=&scanner;
yyset_in (in,scanner );
yyset_debug(1,scanner);
gc_parse(ctx);
yylex_destroy(scanner);
}
Here is the parser gc.y:

%{
#include <stdio.h>
#include <stdlib.h>
#include "gc.h"
%}

%union {
char* s;
int integer;
GeneticCode gCode;
}

%pure-parser
%name-prefix="gc_"
%defines
%error-verbose
%parse-param{GCState state}
%lex-param{void* handle_to_the_scanner}

%token GENETIC_CODE_TABLE NAME OPEN CLOSE COMMA ASSIGN NCBIEAA SNCBIEAA ID
%token<integer> INTEGER
%token<s> TEXT

%type<gCode> code
%type<gCode> codes
%type<s> optional_name
%start input

%{
/* Scanner entry point generated by flex from gc.l (prefix "gc_",
   bison-bridge, reentrant). Declared here because the generated parser
   calls it and would otherwise rely on an implicit declaration. */
extern int gc_lex(YYSTYPE* lvalp, void* scanner);

/**
 * Error callback invoked by the generated parser.
 * Writes the message to stderr; the parser state is not used.
 */
void yyerror(GCState state,const char* message)
{
    (void)state;
    fprintf(stderr,"ERROR:%s\n",message);
}
/* The %lex-param argument is named handle_to_the_scanner; this macro makes
   that name resolve to the field of the parser's `state` parameter. */
#define handle_to_the_scanner (state->handle_to_the_scanner)

%}

%%

input: GENETIC_CODE_TABLE ASSIGN OPEN codes CLOSE
{
    /* Print the whole table back in its ASN.1 text form, releasing each
       entry (and the strings allocated by the lexer) once it is written. */
    GeneticCode gc=$4;
    fputs("Genetic-code-table ::= {",stdout);
    while(gc!=NULL)
    {
        GeneticCode following=gc->next; /* saved before gc is freed */
        printf("{name \"%s\",", gc->name1);
        if(gc->name2!=NULL) printf("name \"%s\",", gc->name2);
        printf("id %d,ncbieaa \"%s\",sncbieaa \"%s\"}",
            gc->id, gc->ncbieaa, gc->sncbieaa
            );
        if(following!=NULL) fputc(',',stdout);
        free(gc->name1);
        free(gc->name2); /* may be NULL: the second name is optional */
        free(gc->ncbieaa);
        free(gc->sncbieaa);
        free(gc);
        gc=following;
    }
    fputc('}',stdout);
};

codes: code
{
    /* First entry: it is the whole list, so terminate it explicitly. */
    $1->next=NULL;
    $$=$1;
}
| codes COMMA code
{
    /* Append the new entry at the tail so the list keeps the order of the
       input (prepending reversed the table). The table is short, so
       walking to the tail each time is acceptable. */
    GeneticCode tail=$1;
    while(tail->next!=NULL) tail=tail->next;
    $3->next=NULL;
    tail->next=$3;
    $$=$1;
}
;


code: OPEN
    NAME TEXT COMMA
    optional_name
    ID INTEGER COMMA
    NCBIEAA TEXT COMMA
    SNCBIEAA TEXT
    CLOSE
{
    /* Build one table entry. The TEXT strings were allocated by the lexer
       and are now owned by the entry. */
    $$ = malloc(sizeof(GeneticCodeStruct));
    if($$==NULL) exit(EXIT_FAILURE);
    $$->name1=$3;
    $$->name2=$5;
    $$->id=$7;
    $$->ncbieaa=$10;
    $$->sncbieaa=$13;
    /* malloc does not clear memory: without this the list has no end */
    $$->next=NULL;
}
;
optional_name:
    /* empty: the entry has no second name */
    {
        $$ = NULL;
    }
  | NAME TEXT COMMA
    {
        /* hand the string allocated by the lexer to the enclosing rule */
        $$ = $2;
    }
  ;
%%

/* Defined in the user-code section of gc.l. */
extern void gcParse(FILE* in);

/**
 * Program entry point: parses a genetic code table from standard input.
 * Command-line arguments are ignored.
 */
int main(int argc,char** argv)
{
    (void)argc;
    (void)argv;

    gcParse(stdin);
    return EXIT_SUCCESS;
}
and the file gc.h
#ifndef GENETIC_CODE_H
#define GENETIC_CODE_H

/** One entry of the genetic code table, chained into a singly linked list. */
struct geneticCode
{
    char *name1;              /**< first name of the code */
    char *name2;              /**< second name of the code; may be NULL */
    int id;                   /**< numeric id of the code */
    char *ncbieaa;            /**< value of the ncbieaa field */
    char *sncbieaa;           /**< value of the sncbieaa field */
    struct geneticCode *next; /**< next entry in the list */
};
/** The entry structure itself. */
typedef struct geneticCode GeneticCodeStruct;
/** Pointer to an entry. */
typedef struct geneticCode *GeneticCode;


/** State carried through the parser so that it can reach the scanner. */
struct gcState
{
    void *handle_to_the_scanner; /**< reference to the flex lexer */
};
/** The state structure itself. */
typedef struct gcState GCStateStruct;
/** Pointer to the state structure. */
typedef struct gcState *GCState;

#endif
Compiling...
# flex reads gc.l and writes the scanner to lex.gc_.c (named after the "gc_" prefix)
flex gc.l
# bison reads gc.y and writes the parser gc.tab.c and, because of %defines, the header gc.tab.h
bison gc.y
# compile and link both generated files; the executable is ./a.out
gcc gc.tab.c lex.gc_.c
Testing:
cat gc.ptr |./a.out | ./a.out |./a.out |./a.out

Genetic-code-table ::= {{name "Standard",name "SGC0",id 1,ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "---M---------------M---------------M----------------------------"},
{name "Vertebrate Mitochondrial",name "SGC1",id 2,ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",sncbieaa "--------------------------------MMMM---------------M------------"},
(...)
,{name "Thraustochytrium Mitochondrial",id 23,ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",sncbieaa "--------------------------------M--M---------------M------------"}}

That's it.
Pierre


No comments: