30 March 2011

Parsing a genomic position with javacc

Parsing a genomic position (chrom:start-end) is an easy task but I've always been too lazy to create a library for this. Today I wrote a Java-CC-based parser for analyzing the various syntaxes of a genomic position. Here is the grammar I used:

COMMA: "," ;
LETTER: (["a"-"z"]|["A"-"Z"]|"_") ;
DIGIT: ["0"-"9"];
INT:<DIGIT> ( (<DIGIT>|<COMMA>)* <DIGIT>)? ;
BP: "b" ("p")? ;
KB: ("k") ("B")? ;
MB: ("m") ("B")? ;
GB: ("g") ("B")? ;
IDENTIFIER: <LETTER> (<DIGIT>|<LETTER>)* ;
COLON: ":" ;
DASH: "-" ;
PLUS: "+" ;
DELIM: ("|"|";") ;


java.util.List<Segment> many(): ( ( segment() ((<DELIM>)? segment() )* )? <EOF> );
Segment one(): segment() <EOF>;
Segment segment(): ( chromName() <COLON> position() (<DASH> position()| <PLUS> position())? );
BigInteger position():integer() (factor())?;
BigInteger factor(): ( <BP> | <KB>| <MB> | <GB> );
BigInteger integer():<INT> ;
String chromName():( integer() | identifier());
String identifier(): <IDENTIFIER> ;

Source code


// JavaCC generator options for the SegmentParser grammar.
options {
// generate a non-static parser: the Java code below creates instances with "new SegmentParser(...)"
static=false;
// parser tracing/debug output is switched off
DEBUG_PARSER=false;
// token literals are matched without regard to case (e.g. "Gb", "gb", "GB")
IGNORE_CASE=true;
}
PARSER_BEGIN(SegmentParser)
/**
* Author:
* Pierre Lindenbaum PhD
* WWW
* http://plindenbaum.blogspot.com
* Motivation
* parsing a genomic fragment (chr:start-end)
* Compilation
* javacc SegmentParser.jj
* javac -Xlint:unchecked SegmentParser.java
* echo " chrM:1-100,000"| java SegmentParser
*/
import java.math.BigInteger;
/**
 * A genomic interval: a chromosome name plus integer start and end positions.
 * Instances are rendered as {@code chrom:start-end}.
 */
class Segment
{
/** chromosome / sequence name */
String chrom;
/** start position */
int start;
/** end position */
int end;

/**
 * Creates a segment from its three components.
 *
 * @param chrom chromosome name
 * @param start start position
 * @param end end position
 */
public Segment(String chrom,int start,int end)
{
this.chrom=chrom;
this.start=start;
this.end=end;
}

/** @return the segment formatted as {@code chrom:start-end} */
@Override
public String toString()
{
StringBuilder text=new StringBuilder();
text.append(chrom);
text.append(':');
text.append(start);
text.append('-');
text.append(end);
return text.toString();
}
}
/**
 * Parser for genomic positions such as {@code chr1:100-200}, {@code 2:1Gb+1}
 * or {@code chrM:1,000}. The parsing methods themselves are generated by
 * JavaCC from the productions that follow PARSER_END.
 */
public class SegmentParser
{
private static final BigInteger TWO=BigInteger.valueOf(2L);
private static final BigInteger INT_MAX=BigInteger.valueOf(Integer.MAX_VALUE);

/**
 * Parses a string holding exactly one segment.
 *
 * @param s the text to parse
 * @return the parsed segment
 * @throws IllegalArgumentException if the text is not a valid segment,
 *         whether the failure is syntactic or lexical
 */
public static Segment parseOne(String s)
{
try
{
SegmentParser parser = new SegmentParser(new java.io.StringReader(s));
return parser.one();
}
catch(ParseException err)
{
throw new IllegalArgumentException(err);
}
catch(TokenMgrError err)
{
// the generated token manager reports unexpected characters with an Error,
// not a ParseException: convert it so callers see one failure type
throw new IllegalArgumentException(err);
}
}

/**
 * Parses a string holding zero or more segments, optionally separated
 * by '|' or ';'.
 *
 * @param s the text to parse
 * @return the parsed segments, in input order (empty if the input has none)
 * @throws IllegalArgumentException if the text is not a valid list of segments,
 *         whether the failure is syntactic or lexical
 */
public static java.util.List<Segment> parseMany(String s)
{
try
{
SegmentParser parser = new SegmentParser(new java.io.StringReader(s));
return parser.many();
}
catch(ParseException err)
{
throw new IllegalArgumentException(err);
}
catch(TokenMgrError err)
{
// see parseOne: lexical errors are Errors, not ParseExceptions
throw new IllegalArgumentException(err);
}
}

/**
 * Reads one segment from standard input and prints it to standard output.
 * On failure the stack trace is printed and the process exits with status 1
 * so that shell pipelines can detect the error.
 *
 * @param args ignored
 */
public static void main(String args[])
{
try
{
SegmentParser parser = new SegmentParser(System.in);
System.out.println(parser.one());
}
catch(Throwable err)
{
err.printStackTrace();
System.exit(1);
}
}
}
PARSER_END(SegmentParser)
/* Characters ignored between tokens. Tab and carriage return are skipped as
 * well as space and newline, so tab-separated input and Windows line endings
 * ("\r\n") do not cause a lexical error. */
SKIP:{
" "|"\t"|"\r"|"\n"
}
/* Lexical tokens.
 * COMMA, LETTER and DIGIT are private (#) helpers used only to build other tokens.
 * The unit tokens BP/KB/MB/GB are declared before IDENTIFIER on purpose: when two
 * tokens match the same text, the one declared first is chosen, so "kb" is a unit
 * and not an identifier. As a consequence the names b, bp, k, kb, m, mb, g and gb
 * (in any case, because IGNORE_CASE is on) are never produced as IDENTIFIER. */
TOKEN:
{
  <#COMMA: "," >
| <#LETTER: ["a"-"z","A"-"Z","_"] >
| <#DIGIT: ["0"-"9"] >
  /* digits with optional thousands separators; must start and end with a digit */
| <INT: <DIGIT> ( ( <DIGIT> | <COMMA> )* <DIGIT> )? >
| <BP: "b" ( "p" )? >
| <KB: "k" ( "B" )? >
| <MB: "m" ( "B" )? >
| <GB: "g" ( "B" )? >
| <IDENTIFIER: <LETTER> ( <LETTER> | <DIGIT> )* >
| <COLON: ":" >
| <DASH: "-" >
| <PLUS: "+" >
| <DELIM: ["|",";"] >
}
/**
 * Parses zero or more segments up to the end of input. Consecutive segments
 * may be separated by an optional DELIM token.
 * Returns the segments in the order they were read.
 */
public java.util.List<Segment> many():
{
java.util.List<Segment> segments=new java.util.ArrayList<Segment>();
Segment current;
}
{
(
  (
    current=segment() { segments.add(current); }
    ( ( <DELIM> )? current=segment() { segments.add(current); } )*
  )?
  <EOF>
)
{
return segments;
}
}
/**
 * Parses exactly one segment followed by the end of input and returns it.
 */
public Segment one():
{
Segment parsed;
}
{
parsed=segment() <EOF>
{
return parsed;
}
}
/**
 * Parses one segment: chromName ":" position, optionally followed by
 *   "-" position : an explicit end, or
 *   "+" position : an extension applied on both sides of the first position.
 * With no suffix the end is start+1.
 * Throws ParseException if start is negative, if end is before start, or if
 * either bound is greater than Integer.MAX_VALUE.
 */
private Segment segment():{String chrom;BigInteger start=null; BigInteger end=null ; char sign='?';}
{
( chrom=chromName() <COLON> start= position() (<DASH> end=position() {sign='-';}| <PLUS> end=position() {sign='+';})? )
{
switch(sign)
{
// single position: one-base interval
case '?': end=start.add(BigInteger.ONE);break;
// explicit range: both bounds were given
case '-': break;
// "pos+ext": here 'end' holds ext. start becomes pos-ext, and end becomes
// (pos-ext)+2*ext+1, i.e. pos+ext+1
case '+': start=start.subtract(end);end=BigInteger.ONE.add(start.add(end.multiply(TWO)));break;
}
if(start.compareTo(BigInteger.ZERO)<0) throw new ParseException(start.toString()+" < 0)");
if(end.compareTo(start)<0) throw new ParseException(start.toString()+" > "+end+")");
// reject only values strictly greater than Integer.MAX_VALUE: the maximum itself
// fits in an int and the message says ">"
if(start.compareTo(INT_MAX)>0) throw new ParseException(start.toString()+" > "+ INT_MAX);
if(end.compareTo(INT_MAX)>0) throw new ParseException(end.toString()+" > "+ INT_MAX);
return new Segment(chrom,start.intValue(),end.intValue());
}
}
/**
 * Parses an integer optionally followed by a unit (bp, kb, mb, gb) and
 * returns the integer multiplied by the unit's factor (1 when no unit is given).
 */
private BigInteger position():
{
BigInteger value=null;
BigInteger scale=BigInteger.ONE;
}
{
value=integer() ( scale=factor() )?
{
return value.multiply(scale);
}
}
/**
 * Parses a unit token and returns its multiplier:
 * BP = 1, KB = 1000, MB = 1000000, GB = 1000000000.
 */
private BigInteger factor():{}
{
(
  <BP> { return BigInteger.ONE; }
| <KB> { return BigInteger.valueOf(1000L); }
| <MB> { return BigInteger.valueOf(1000000L); }
| <GB> { return BigInteger.valueOf(1000000000L); }
)
}
/**
 * Parses an INT token and returns its value; commas in the token
 * (thousands separators) are removed before conversion.
 */
private BigInteger integer():
{
Token number;
}
{
number=<INT>
{
String digits=number.image.replace(",","");
return new BigInteger(digits);
}
}
/**
 * Parses a chromosome name. A purely numeric name is returned with a "chr"
 * prefix in front of its integer value; an identifier is returned as written.
 */
private String chromName():
{
BigInteger number;
String name;
}
{
(
  number=integer() { return "chr"+number.toString(); }
| name=identifier() { return name; }
)
}
/**
 * Parses an IDENTIFIER token and returns its text unchanged.
 */
private String identifier():
{
Token word;
}
{
word=<IDENTIFIER>
{
return word.image;
}
}

Compiling

javacc SegmentParser.jj
javac SegmentParser.java

Running

echo " chrM:1-100,000"| java SegmentParser
chrM:1-100000
echo " c1:1000"| java SegmentParser
c1:1000-1001
echo "2:1Gb+1 " | java SegmentParser
chr2:999999999-1000000002
echo "chr2:10+100" | java SegmentParser
ParseException: -90 < 0)
echo "chrX:3147483647" | java SegmentParser
ParseException: 3147483647 > 2147483647 (int-max)
echo "2:1Gb+a azd " | java SegmentParser
ParseException: Encountered "a" at line 1, column 7


That's it,

Pierre

No comments: