Parsing a genomic position with javacc
Parsing a genomic position (chrom:start-end)
is an easy task, but I've always been too lazy to create a library for it. Today I wrote a JavaCC-based parser that handles the various syntaxes of a genomic position. Here is the grammar I used:
LETTER: (["a"-"z"]|["A"-"Z"]|"_") ;
DIGIT: ["0"-"9"];
INT:<DIGIT> ( (<DIGIT>|<COMMA>)* <DIGIT>)? ;
BP: "b" ("p")? ;
KB: ("k") ("B")? ;
MB: ("m") ("B")? ;
GB: ("g") ("B")? ;
IDENTIFIER: <LETTER> (<DIGIT>|<LETTER>)* ;
COLON: ":" ;
DASH: "-" ;
PLUS: "+" ;
DELIM: ("|"|";") ;
java.util.List<Segment> many(): ( ( segment() ((<DELIM>)? segment() )* )? <EOF> );
Segment one(): segment() <EOF>;
Segment segment(): ( chromName() <COLON> position() (<DASH> position()| <PLUS> position())? );
BigInteger position():integer() (factor())?;
BigInteger factor(): ( <BP> | <KB>| <MB> | <GB> );
BigInteger integer():<INT> ;
String chromName():( integer() | identifier());
String identifier(): <IDENTIFIER> ;
Source code
options { | |
static=false; | |
DEBUG_PARSER=false; | |
IGNORE_CASE=true; | |
} | |
PARSER_BEGIN(SegmentParser) | |
/** | |
* Author: | |
* Pierre Lindenbaum PhD | |
* WWW | |
* http://plindenbaum.blogspot.com | |
* Motivation | |
* parsing a genomic fragment (chr:start-end) | |
* Compilation | |
* javacc SegmentParser.jj | |
* javac -Xlint:unchecked SegmentParser.java | |
* echo " chrM:1-100,000"| java SegmentParser | |
*/ | |
import java.math.BigInteger; | |
/**
 * A named interval: a chromosome (or contig) name plus two integer coordinates.
 * The class performs no validation; it stores the values it is given.
 */
class Segment
{
    /** Chromosome / sequence name. */
    String chrom;
    /** First coordinate of the interval. */
    int start;
    /** Second coordinate of the interval. */
    int end;

    /**
     * Creates a segment from its three components.
     *
     * @param chrom chromosome name
     * @param start first coordinate
     * @param end   second coordinate
     */
    public Segment(String chrom, int start, int end)
    {
        this.chrom = chrom;
        this.start = start;
        this.end = end;
    }

    /**
     * Formats the segment as {@code chrom:start-end}.
     *
     * @return the textual form of this segment
     */
    @Override
    public String toString()
    {
        final StringBuilder text = new StringBuilder();
        text.append(chrom).append(':').append(start).append('-').append(end);
        return text.toString();
    }
}
public class SegmentParser | |
{ | |
private static final BigInteger TWO=new BigInteger("2"); | |
private static final BigInteger INT_MAX=new BigInteger(String.valueOf(Integer.MAX_VALUE)); | |
public static Segment parseOne(String s) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(new java.io.StringReader(s)); | |
return parser.one(); | |
} | |
catch(ParseException err) | |
{ | |
throw new IllegalArgumentException(err); | |
} | |
} | |
public static java.util.List<Segment> parseMany(String s) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(new java.io.StringReader(s)); | |
return parser.many(); | |
} | |
catch(ParseException err) | |
{ | |
throw new IllegalArgumentException(err); | |
} | |
} | |
public static void main(String args[]) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(System.in); | |
System.out.println(parser.one()); | |
} | |
catch(Throwable err) | |
{ | |
err.printStackTrace(); | |
} | |
} | |
} | |
PARSER_END(SegmentParser) | |
/*
 * Characters silently discarded between tokens.
 * Tabs and carriage returns are skipped as well as spaces and line feeds, so
 * that tab-separated input and Windows (CRLF) line endings do not trigger a
 * lexical error (TokenMgrError), which the parseOne/parseMany wrappers do not catch.
 */
SKIP:
{
    " " | "\t" | "\r" | "\n"
}
/* Lexical tokens. Names prefixed with '#' are private helpers usable only inside other tokens. */
TOKEN:
{
      /* private building blocks */
      <#COMMA:  "," >
    | <#LETTER: ( ["a"-"z"] | ["A"-"Z"] | "_" ) >
    | <#DIGIT:  ["0"-"9"] >

      /* an integer: starts and ends with a digit, commas allowed in between (e.g. 100,000) */
    | <INT: <DIGIT> ( ( <DIGIT> | <COMMA> )* <DIGIT> )? >

      /* unit suffixes for a position */
    | <BP: "b" ("p")? >
    | <KB: ("k") ("B")? >
    | <MB: ("m") ("B")? >
    | <GB: ("g") ("B")? >

      /* a name: a letter or underscore followed by letters, digits or underscores */
    | <IDENTIFIER: <LETTER> ( <DIGIT> | <LETTER> )* >

      /* punctuation */
    | <COLON: ":" >
    | <DASH:  "-" >
    | <PLUS:  "+" >
    | <DELIM: ( "|" | ";" ) >
}
/**
 * Parses zero or more segments up to the end of input.
 * Consecutive segments may optionally be separated by a DELIM token.
 *
 * @return the segments in the order they were read (empty if the input holds none)
 */
public java.util.List<Segment> many():
{
    java.util.List<Segment> segments=new java.util.ArrayList<Segment>();
    Segment seg;
}
{
    (
        (
            seg=segment() { segments.add(seg); }
            ( (<DELIM>)? seg=segment() { segments.add(seg); } )*
        )?
        <EOF>
    )
    {
        return segments;
    }
}
/**
 * Parses exactly one segment; anything other than end of input after it is an error.
 *
 * @return the parsed segment
 */
public Segment one():
{
    Segment parsed;
}
{
    (
        parsed=segment()
        <EOF>
    )
    {
        return parsed;
    }
}
/**
 * Parses one segment: a chromosome name, a colon, a position and an optional
 * second position introduced by '-' or '+'.
 *
 * Interpretation of the three forms:
 *   name:pos       start = pos,       end = pos + 1
 *   name:a-b       start = a,         end = b
 *   name:pos+ext   start = pos - ext, end = pos + ext + 1
 *
 * @return the segment, with both coordinates guaranteed to fit in an int
 * @throws ParseException if start is negative, end is smaller than start,
 *         or either coordinate exceeds Integer.MAX_VALUE
 */
private Segment segment():
{
    String chrom;
    BigInteger start=null;
    BigInteger end=null;
    // '?' = no second position, '-' = explicit end, '+' = extension around start
    char sign='?';
}
{
    (
        chrom=chromName() <COLON> start=position()
        (
              <DASH> end=position() { sign='-'; }
            | <PLUS> end=position() { sign='+'; }
        )?
    )
    {
        switch(sign)
        {
            case '?':
                end=start.add(BigInteger.ONE);
                break;
            case '-':
                break;
            case '+':
                // here 'end' still holds the extension: shift start left by it...
                start=start.subtract(end);
                // ...then end = (pos-ext) + 2*ext + 1 = pos + ext + 1
                end=BigInteger.ONE.add(start.add(end.multiply(TWO)));
                break;
        }
        if(start.compareTo(BigInteger.ZERO)<0) throw new ParseException(start.toString()+" < 0)");
        if(end.compareTo(start)<0) throw new ParseException(start.toString()+" > "+end+")");
        // Strict comparison: a coordinate equal to Integer.MAX_VALUE still fits in an int.
        if(INT_MAX.compareTo(start)<0) throw new ParseException(start.toString()+" > "+ INT_MAX);
        if(INT_MAX.compareTo(end)<0) throw new ParseException(end.toString()+" > "+ INT_MAX);
        return new Segment(chrom,start.intValue(),end.intValue());
    }
}
/**
 * Parses an integer optionally followed by a unit (bp, kb, mb, gb).
 *
 * @return the integer multiplied by the unit's factor (factor 1 when no unit is given)
 */
private BigInteger position():
{
    BigInteger value;
    BigInteger scale=BigInteger.ONE;
}
{
    value=integer()
    ( scale=factor() )?
    {
        return value.multiply(scale);
    }
}
/**
 * Parses a unit token and maps it to its multiplier.
 *
 * @return 1 for BP, 1000 for KB, 1000000 for MB, 1000000000 for GB
 */
private BigInteger factor():
{
}
{
    (
          <BP> { return BigInteger.ONE; }
        | <KB> { return BigInteger.valueOf(1000L); }
        | <MB> { return BigInteger.valueOf(1000000L); }
        | <GB> { return BigInteger.valueOf(1000000000L); }
    )
}
/**
 * Parses an INT token, dropping any commas it contains.
 *
 * @return the numeric value of the token (e.g. "100,000" gives 100000)
 */
private BigInteger integer():
{
    Token number;
}
{
    number=<INT>
    {
        String digitsOnly=number.image.replace(",","");
        return new BigInteger(digitsOnly);
    }
}
/**
 * Parses a chromosome name.
 *
 * A purely numeric name is prefixed with "chr" (e.g. "2" becomes "chr2");
 * an identifier is returned as written.
 *
 * The unit tokens BP, KB, MB and GB are declared before IDENTIFIER and
 * matching is case-insensitive, so names such as "M", "b", "k", "g", "kb"
 * or "bp" are lexed as unit tokens rather than identifiers. They are accepted
 * here as names too; otherwise an input like "M:1-100" would be rejected.
 *
 * @return the chromosome name
 */
private String chromName():
{
    BigInteger i;
    String s;
    Token unit;
}
{
    (
          i=integer() { return "chr"+i.toString(); }
        | s=identifier() { return s; }
        | ( unit=<BP> | unit=<KB> | unit=<MB> | unit=<GB> ) { return unit.image; }
    )
}
/**
 * Parses an IDENTIFIER token.
 *
 * @return the token text exactly as it appeared in the input
 */
private String identifier():
{
    Token name;
}
{
    name=<IDENTIFIER>
    {
        return name.image;
    }
}
Compiling
javac SegmentParser.java
Running
echo " chrM:1-100,000"| java SegmentParser
chrM:1-100000
echo " c1:1000"| java SegmentParser
c1:1000-1001
echo "2:1Gb+1 " | java SegmentParser
chr2:999999999-1000000002
echo "chr2:10+100" | java SegmentParser
ParseException: -90 < 0)
echo "chrX:3147483647" | java SegmentParser
ParseException: 3147483647 > 2147483647 (int-max)
echo "2:1Gb+a azd " | java SegmentParser
ParseException: Encountered "a" at line 1, column 7
That's it,
Pierre