Playing with the #Ensembl REST API: filtering a VCF with javascript
The new Ensembl REST API was announced today: "We are pleased to announce the beta release of our programming language agnostic REST API, for Release 68 data, at http://beta.rest.ensembl. Our initial release provides access to:
- Sequences (genomic, cDNA, CDS and protein)
- VEP (Variant Effect Predictor)
- Homologies
- Gene Trees
- Assembly and coordinate mapping."
In the current post I will filter a VCF file with this API and javascript.
The VCF
Our initial file is the following VCF:##fileformat=VCFv4.0 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> #CHROM POS ID REF ALT QUAL FILTER INFO 1 10327 rs112750067 T C . PASS DP=65 1 69427 rs148502021 T A . PASS DP=1557 1 69452 rs142004627 A G . PASS DP=155 1 69475 rs148502021 C T . PASS DP=231 1 865583 rs148711625 A G . PASS DP=231 1 866460 rs148884928 A G . PASS DP=23 1 866461 . G A . PASS DP=24
The javascript
The VCF will be read on the standard input using the following script and the Rhino JS engine. The script reads the VCF and for each substitution, it calls the Ensembl REST API, parses the JSON response and return the transcript identifier if the mutation is a missense_variant or a polyphen_prediction="probably damaging":importPackage(java.io); importPackage(java.lang); function sleep(milliseconds) { /* hacked from http://www.phpied.com/sleep-in-javascript/ */ var start = new Date().getTime(); for (var i = 0; i < 1e7; i++) { if ((new Date().getTime() - start) > milliseconds){ break; } } } function damagingTranscript(json) { for(var d in json.data) { var transcripts=json.data[d].transcripts; if(!transcripts) return null; for(var t in transcripts) { var transcript=transcripts[t]; for(var a in transcript.alleles) { var allele=transcript.alleles[a]; if(allele.polyphen_prediction=="probably damaging" || allele.consequence_terms.indexOf("missense_variant")!=-1 ) { return transcript.transcript_id; } } } } return null; } var baseRegex=new RegExp(/^[ATGCatgc]$/); var stdin = new java.io.BufferedReader( new java.io.InputStreamReader(java.lang.System['in']) ); var line; while((line=stdin.readLine())!=null) { if(line.startsWith("#")) { print(line); continue; } var tokens=line.split("\t"); var chrom=tokens[0]; var pos= parseInt(tokens[1]); var ref= tokens[3]; var alt= tokens[4]; if(!baseRegex.test(ref)) continue; if(!baseRegex.test(alt)) continue; sleep(200); var url="http://beta.rest.ensembl.org/vep/human/"+ chrom+":"+pos+"-"+pos+"/"+alt+ "/consequences?content-type=application/json"; var jsonStr=readUrl(url,"UTF-8"); var json=eval("("+jsonStr+")"); var transcript=damagingTranscript(json); if(transcript==null) continue; tokens[7]+=(";TRANSCRIPT="+transcript); print(tokens.join('\t')); }
As an example, here is the response from the ENSEMBL server for
1:866460 A/G
:http://beta.rest.ensembl.org/vep/human/1:866460-866460/G/consequences?content-type=application/json
{ "data": [ { "location": { "coord_system": "chromosome", "name": "1", "strand": 1, "end": "866460", "start": "866460" }, "hgvs": { "G": "1:g.866460C>G" }, "transcripts": [ { "translation_stable_id": "ENSP00000411579", "intron_number": null, "cdna_end": 385, "translation_end": 99, "exon_number": "4/7", "is_canonical": 0, "transcript_id": "ENST00000420190", "cdna_start": 385, "gene_id": "ENSG00000187634", "cds_start": 296, "translation_start": 99, "alleles": { "C/G": { "sift_prediction": "deleterious", "polyphen_prediction": "benign", "polyphen_score": 0.001, "display_codon_allele_string": "gCg/gGg", "hgvs_protein": "ENSP00000411579.1:p.Ala99Gly", "sift_score": 0.04, "consequence_terms": [ "missense_variant" ], "pep_allele_string": "A/G", "codon_allele_string": "GCG/GGG", "hgvs_transcript": "ENST00000420190.1:c.296C>G" } }, "name": "SAMD11", "biotype": "protein_coding", "cds_end": 296, "cdna_allele_string": "C/G", "codon_position": 2 }, { "translation_stable_id": "ENSP00000393181", "intron_number": null, "cdna_end": 356, "translation_end": 99, "exon_number": "4/5", "is_canonical": 0, "transcript_id": "ENST00000437963", "cdna_start": 356, "gene_id": "ENSG00000187634", "cds_start": 296, "translation_start": 99, "alleles": { "C/G": { "sift_prediction": "deleterious", "polyphen_prediction": "benign", "polyphen_score": 0.001, "display_codon_allele_string": "gCg/gGg", "hgvs_protein": "ENSP00000393181.1:p.Ala99Gly", "sift_score": 0.03, "consequence_terms": [ "missense_variant" ], "pep_allele_string": "A/G", "codon_allele_string": "GCG/GGG", "hgvs_transcript": "ENST00000437963.1:c.296C>G" } }, "name": "SAMD11", "biotype": "protein_coding", "cds_end": 296, "cdna_allele_string": "C/G", "codon_position": 2 }, { "translation_stable_id": "ENSP00000342313", "intron_number": null, "cdna_end": 379, "translation_end": 99, "exon_number": "4/14", "is_canonical": 1, "transcript_id": "ENST00000342066", "cdna_start": 379, "gene_id": "ENSG00000187634", "cds_start": 296, "alleles": { "C/G": { "sift_prediction": "deleterious", "polyphen_prediction": "benign", "polyphen_score": 0.001, "display_codon_allele_string": "gCg/gGg", "hgvs_protein": "ENSP00000342313.3:p.Ala99Gly", "sift_score": 0.01, "consequence_terms": [ "missense_variant" ], "pep_allele_string": "A/G", "codon_allele_string": "GCG/GGG", "hgvs_transcript": "ENST00000342066.3:c.296C>G" } }, "translation_start": 99, "name": "SAMD11", "biotype": "protein_coding", "cds_end": 296, "cdna_allele_string": "C/G", "ccds": "CCDS2.2", "codon_position": 2 }, { "translation_stable_id": "ENSP00000349216", "intron_number": null, "cdna_end": 67, "translation_end": 23, "exon_number": "2/12", "is_canonical": 0, "transcript_id": "ENST00000341065", "cdna_start": 67, "gene_id": "ENSG00000187634", "cds_start": 68, "translation_start": 23, "alleles": { "C/G": { "sift_prediction": "deleterious", "polyphen_prediction": "benign", "polyphen_score": 0.008, "display_codon_allele_string": "gCg/gGg", "hgvs_protein": "ENSP00000349216.4:p.Ala23Gly", "sift_score": 0.01, "consequence_terms": [ "missense_variant" ], "pep_allele_string": "A/G", "codon_allele_string": "GCG/GGG", "hgvs_transcript": "ENST00000341065.4:c.67C>G" } }, "name": "SAMD11", "biotype": "protein_coding", "cds_end": 68, "cdna_allele_string": "C/G", "codon_position": 2 } ] } ] }
Invoking RHINO, filtering the VCF
$ cat input.vcf | rhino -f restensembl.js ##fileformat=VCFv4.0 ##INFO=#CHROM POS ID REF ALT QUAL FILTER INFO 1 69427 rs148502021 T A . PASS DP=1557;TRANSCRIPT=ENST00000335137 1 69452 rs142004627 A G . PASS DP=155;TRANSCRIPT=ENST00000335137 1 69475 rs148502021 C T . PASS DP=231;TRANSCRIPT=ENST00000335137 1 865583 rs148711625 A G . PASS DP=231;TRANSCRIPT=ENST00000420190 1 866460 rs148884928 A G . PASS DP=23;TRANSCRIPT=ENST00000420190
Limitations for the Ensembl REST API:
@yokofakun how many requests have you done? We rate limit to 3 per second averaged over the hour and allow spikes of 6 per second
— andrewyatz (@andrewyatz) September 27, 2012
@yokofakun lol I like your style :). If you do 3 predictions per second you'll be fine :)
— andrewyatz (@andrewyatz) September 27, 2012
That's it,
Pierre