Parsing a genomic position with javacc
Parsing a genomic position (chrom:start-end)
is an easy task, but I've always been too lazy to create a library for it. Today I wrote a JavaCC-based parser that handles the various syntaxes of a genomic position. Here is the grammar I used:
COMMA: "," ;
LETTER: (["a"-"z"]|["A"-"Z"]|"_") ;
DIGIT: ["0"-"9"];
INT:<DIGIT> ( (<DIGIT>|<COMMA>)* <DIGIT>)? ;
BP: "b" ("p")? ;
KB: ("k") ("B")? ;
MB: ("m") ("B")? ;
GB: ("g") ("B")? ;
IDENTIFIER: <LETTER> (<DIGIT>|<LETTER>)* ;
COLON: ":" ;
DASH: "-" ;
PLUS: "+" ;
DELIM: ("|"|";") ;
java.util.List<Segment> many(): ( ( segment() ((<DELIM>)? segment() )* )? <EOF> );
Segment one(): segment() <EOF>;
Segment segment(): ( chromName() <COLON> position() (<DASH> position()| <PLUS> position())? );
BigInteger position():integer() (factor())?;
BigInteger factor(): ( <BP> | <KB>| <MB> | <GB> );
BigInteger integer():<INT> ;
String chromName():( integer() | identifier());
String identifier(): <IDENTIFIER> ;
LETTER: (["a"-"z"]|["A"-"Z"]|"_") ;
DIGIT: ["0"-"9"];
INT:<DIGIT> ( (<DIGIT>|<COMMA>)* <DIGIT>)? ;
BP: "b" ("p")? ;
KB: ("k") ("B")? ;
MB: ("m") ("B")? ;
GB: ("g") ("B")? ;
IDENTIFIER: <LETTER> (<DIGIT>|<LETTER>)* ;
COLON: ":" ;
DASH: "-" ;
PLUS: "+" ;
DELIM: ("|"|";") ;
java.util.List<Segment> many(): ( ( segment() ((<DELIM>)? segment() )* )? <EOF> );
Segment one(): segment() <EOF>;
Segment segment(): ( chromName() <COLON> position() (<DASH> position()| <PLUS> position())? );
BigInteger position():integer() (factor())?;
BigInteger factor(): ( <BP> | <KB>| <MB> | <GB> );
BigInteger integer():<INT> ;
String chromName():( integer() | identifier());
String identifier(): <IDENTIFIER> ;
Source code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
options {
    /* generate instance (non-static) parser methods, so a new parser can be created per input */
    static=false;
    /* no parser trace output */
    DEBUG_PARSER=false;
    /* tokens are matched case-insensitively ("kb", "Kb" and "KB" are the same token) */
    IGNORE_CASE=true;
}
PARSER_BEGIN(SegmentParser) | |
/**
 * Author:
 *     Pierre Lindenbaum PhD
 * WWW
 *     http://plindenbaum.blogspot.com
 * Motivation
 *     parsing a genomic fragment (chr:start-end)
 * Compilation
 *     javacc SegmentParser.jj
 *     javac -Xlint:unchecked SegmentParser.java
 *     echo " chrM:1-100,000"| java SegmentParser
 */
import java.math.BigInteger; | |
/**
 * A genomic interval: a chromosome name plus a start and an end coordinate.
 * Instances are built by the parser productions of {@code SegmentParser}.
 */
class Segment
{
    /** Chromosome (sequence) name, stored exactly as passed to the constructor. */
    String chrom;
    /** Start coordinate, stored exactly as passed to the constructor. */
    int start;
    /** End coordinate, stored exactly as passed to the constructor. */
    int end;

    /**
     * Creates a segment; no validation is performed here.
     *
     * @param chrom chromosome name
     * @param start start coordinate
     * @param end   end coordinate
     */
    public Segment(String chrom,int start,int end)
    {
        this.chrom=chrom;
        this.start=start;
        this.end=end;
    }

    /**
     * @return the segment formatted as {@code chrom:start-end}
     */
    @Override
    public String toString()
    {
        return chrom+":"+start+"-"+end;
    }
}
public class SegmentParser | |
{ | |
private static final BigInteger TWO=new BigInteger("2"); | |
private static final BigInteger INT_MAX=new BigInteger(String.valueOf(Integer.MAX_VALUE)); | |
public static Segment parseOne(String s) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(new java.io.StringReader(s)); | |
return parser.one(); | |
} | |
catch(ParseException err) | |
{ | |
throw new IllegalArgumentException(err); | |
} | |
} | |
public static java.util.List<Segment> parseMany(String s) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(new java.io.StringReader(s)); | |
return parser.many(); | |
} | |
catch(ParseException err) | |
{ | |
throw new IllegalArgumentException(err); | |
} | |
} | |
public static void main(String args[]) | |
{ | |
try | |
{ | |
SegmentParser parser = new SegmentParser(System.in); | |
System.out.println(parser.one()); | |
} | |
catch(Throwable err) | |
{ | |
err.printStackTrace(); | |
} | |
} | |
} | |
PARSER_END(SegmentParser) | |
/* Whitespace ignored between tokens. Tabs and carriage returns are included
   so that tab-separated input and CRLF line endings do not cause a lexical error. */
SKIP:{
  " "
| "\t"
| "\n"
| "\r"
}
/* Lexical tokens. Entries prefixed with '#' are private helper expressions and
   never reach the parser. Matching is case-insensitive (IGNORE_CASE option). */
TOKEN:
{
  <#COMMA: ",">
| <#LETTER:(["a"-"z"]|["A"-"Z"]|"_")>
| <#DIGIT:["0"-"9"]>
  /* digits with optional thousands separators, e.g. 100,000 ; must start and end with a digit */
| <INT:<DIGIT> ( (<DIGIT>|<COMMA>)* <DIGIT>)? >
  /* unit suffixes: they are declared before IDENTIFIER so that, for an input
     such as "kb", the unit token wins over IDENTIFIER when both match the same text */
| <BP: "b" ("p")?>
| <KB: ("k") ("B")?>
| <MB: ("m") ("B")?>
| <GB: ("g") ("B")?>
| <IDENTIFIER: <LETTER> (<DIGIT>|<LETTER>)* >
| <COLON: ":" >
| <DASH: "-" >
| <PLUS: "+" >
  /* optional separator between two segments */
| <DELIM: ("|"|";") >
}
/**
 * Parses zero or more segments up to the end of input. Consecutive segments
 * may optionally be separated by a DELIM token.
 * Returns the segments in input order (an empty list for empty input).
 */
public java.util.List<Segment> many():
    {
    java.util.List<Segment> segments=new java.util.ArrayList<Segment>();
    Segment seg;
    }
    {
    (
        (
        seg=segment() { segments.add(seg); }
        ( (<DELIM>)? seg=segment() { segments.add(seg); } )*
        )?
        <EOF>
    )
        {
        return segments;
        }
    }
/**
 * Parses exactly one segment followed by the end of input and returns it.
 */
public Segment one():{Segment pos; }
    {
    ( pos=segment() <EOF> )
        {
        return pos;
        }
    }
/**
 * Parses one segment. Three forms are accepted:
 *   chrom:pos      -> start=pos,     end=pos+1
 *   chrom:a-b      -> start=a,       end=b
 *   chrom:pos+ext  -> start=pos-ext, end=pos+ext+1
 * Throws a ParseException if start is negative, if end is smaller than start,
 * or if either bound does not fit in an int.
 */
private Segment segment():
    {
    String chrom;
    BigInteger start=null;
    BigInteger end=null;
    char sign='?';
    }
    {
    (
    chrom=chromName() <COLON> start=position()
        (
          <DASH> end=position() { sign='-'; }
        | <PLUS> end=position() { sign='+'; }
        )?
    )
        {
        switch(sign)
            {
            case '?': end=start.add(BigInteger.ONE); break;
            case '-': break;
            case '+':
                {
                /* 'end' currently holds the extension around the central position */
                BigInteger center=start;
                start=center.subtract(end);
                end=center.add(end).add(BigInteger.ONE);
                break;
                }
            }
        if(start.signum()<0) throw new ParseException(start.toString()+" < 0");
        if(end.compareTo(start)<0) throw new ParseException(start.toString()+" > "+end);
        /* a bound equal to Integer.MAX_VALUE still fits in an int: only reject strictly larger values */
        if(start.compareTo(INT_MAX)>0) throw new ParseException(start.toString()+" > "+INT_MAX+" (int-max)");
        if(end.compareTo(INT_MAX)>0) throw new ParseException(end.toString()+" > "+INT_MAX+" (int-max)");
        return new Segment(chrom,start.intValue(),end.intValue());
        }
    }
/**
 * Parses an integer optionally followed by a unit suffix and returns the
 * integer multiplied by the unit's factor (factor 1 when no suffix is given).
 */
private BigInteger position():{BigInteger value=null; BigInteger mul=BigInteger.ONE;}
    {
    value=integer() ( mul=factor() )? { return value.multiply(mul); }
    }
/**
 * Parses a unit suffix and returns its multiplier:
 * BP = 1, KB = 1,000, MB = 1,000,000, GB = 1,000,000,000.
 */
private BigInteger factor():{}
    {
    (
      <BP> { return BigInteger.ONE; }
    | <KB> { return BigInteger.valueOf(1000L); }
    | <MB> { return BigInteger.valueOf(1000000L); }
    | <GB> { return BigInteger.valueOf(1000000000L); }
    )
    }
/**
 * Parses an INT token and returns its value; thousands separators (commas)
 * are removed before conversion.
 */
private BigInteger integer():{Token t;}
    {
    t=<INT> { return new BigInteger(t.image.replace(",","")); }
    }
/**
 * Parses a chromosome name. A purely numeric name is returned with a "chr"
 * prefix (e.g. 2 -> "chr2"); an identifier is returned unchanged.
 */
private String chromName():{BigInteger i; String s;}
    {
    (
      i=integer()    { return "chr"+i.toString(); }
    | s=identifier() { return s; }
    )
    }
/**
 * Parses a name and returns its text as written in the input.
 * The unit tokens are accepted here as well: they are declared before
 * IDENTIFIER in the TOKEN section, so a name such as "M", "b" or "kb" is
 * delivered by the lexer as a unit token and would otherwise be rejected
 * as a chromosome name.
 */
private String identifier():{Token t=null;}
    {
    (
      t=<IDENTIFIER>
    | t=<BP>
    | t=<KB>
    | t=<MB>
    | t=<GB>
    )
        {
        return t.image;
        }
    }
Compiling
javacc SegmentParser.jj
javac SegmentParser.java
javac SegmentParser.java
Running
echo " chrM:1-100,000"| java SegmentParser
chrM:1-100000
echo " c1:1000"| java SegmentParser
c1:1000-1001
echo "2:1Gb+1 " | java SegmentParser
chr2:999999999-1000000002
echo "chr2:10+100" | java SegmentParser
ParseException: -90 < 0)
echo "chrX:3147483647" | java SegmentParser
ParseException: 3147483647 > 2147483647 (int-max)
echo "2:1Gb+a azd " | java SegmentParser
ParseException: Encountered "a" at line 1, column 7
chrM:1-100000
echo " c1:1000"| java SegmentParser
c1:1000-1001
echo "2:1Gb+1 " | java SegmentParser
chr2:999999999-1000000002
echo "chr2:10+100" | java SegmentParser
ParseException: -90 < 0)
echo "chrX:3147483647" | java SegmentParser
ParseException: 3147483647 > 2147483647 (int-max)
echo "2:1Gb+a azd " | java SegmentParser
ParseException: Encountered "a" at line 1, column 7
That's it,
Pierre
No comments:
Post a Comment