finding new intron-exon junctions using the public Encode RNASeq data
I've been asked to look for some new / suspected / previously uncharacterized intron-exon junctions in public RNASeq data.
I've used the BAMs under http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeCaltechRnaSeq/.
The following command is used to build the list of BAMs:
curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeCaltechRnaSeq/" |\ tr ' <>"' "\n" | grep -F .bam | grep -v bai | sort | uniq | sed 's/.bam$//' | sed 's/$/ \\/' wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep1V2 \ wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2 \ wgEncodeCaltechRnaSeqGm12878R1x75dSplicesRep1V2 \ wgEncodeCaltechRnaSeqGm12878R1x75dSplicesRep2V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il200AlignsRep1V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il200AlignsRep2V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il200SplicesRep1V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il200SplicesRep2V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il400AlignsRep2V2 \ wgEncodeCaltechRnaSeqGm12878R2x75Il400SplicesRep2V2 \ (...)
This list is inserted as a list named SAMPLES a Makefile.
For each BAM, we use samtools to retrieve the reads in the region(s)) of interest. The reads are then filtered with samjs (https://github.com/lindenb/jvarkit/wiki/SamJS) to only keep the reads carrying an intron-exon junction at the desired location(s). Basically, the javascript-based filter loops over the CIGAR string of the read, computes the genomic interval skipped when the cigar operator is a deletion or a skipped region/intron. The read is printed if it describes the new intron-exon junction.
All in one:
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
SAMPLES=\ | |
wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12878R1x75dAlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12878R1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12878R1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il400AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12878R2x75Il400SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12891R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12891R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12891R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12891R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200AlignsRep3V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqGm12892R2x75Il200SplicesRep3V2 \ | |
wgEncodeCaltechRnaSeqH1hescR1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqH1hescR1x75dAlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqH1hescR1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqH1hescR1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200AlignsRep3V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200AlignsRep4V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200SplicesRep3V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il200SplicesRep4V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il400AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqH1hescR2x75Il400SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHct116R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHct116R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHct116R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHct116R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHelas3R1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHelas3R1x75dAlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHelas3R1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHelas3R1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHelas3R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHelas3R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHelas3R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHelas3R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHepg2R1x75dAlignsRep1V3 \ | |
wgEncodeCaltechRnaSeqHepg2R1x75dAlignsRep2V3 \ | |
wgEncodeCaltechRnaSeqHepg2R1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHepg2R1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHepg2R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHepg2R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHepg2R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHepg2R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHsmmR2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHsmmR2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHsmmR2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHsmmR2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHuvecR1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHuvecR1x75dAlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHuvecR1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHuvecR1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqHuvecR2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqHuvecR2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqHuvecR2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqHuvecR2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqK562R1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqK562R1x75dAlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqK562R1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqK562R1x75dSplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqK562R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqK562R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqK562R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqK562R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqLhcnm2R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqLhcnm2R2x75Il200Myc7dAlignsRep1 \ | |
wgEncodeCaltechRnaSeqLhcnm2R2x75Il200Myc7dSplicesRep1 \ | |
wgEncodeCaltechRnaSeqLhcnm2R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200AlignsRep3V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqMcf7R2x75Il200SplicesRep3V2 \ | |
wgEncodeCaltechRnaSeqNhekR1x75dAlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqNhekR1x75dSplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqNhekR2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqNhekR2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqNhekR2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqNhekR2x75Il200SplicesRep2V2 \ | |
wgEncodeCaltechRnaSeqNhlfR2x75Il200AlignsRep1V2 \ | |
wgEncodeCaltechRnaSeqNhlfR2x75Il200AlignsRep2V2 \ | |
wgEncodeCaltechRnaSeqNhlfR2x75Il200SplicesRep1V2 \ | |
wgEncodeCaltechRnaSeqNhlfR2x75Il200SplicesRep2V2 | |
define run | |
$(1).bam : splice.js | |
${HOME}/package/samtools-0.1.19/samtools view -b "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeCaltechRnaSeq/$(1).bam" "chrABCD:12345-77778" |\ | |
java -jar ~/src/jvarkit-git/dist/samjs.jar -f splice.js |\ | |
${HOME}/package/samtools-0.1.19/samtools view -Sbu - |\ | |
${HOME}/package/samtools-0.1.19/samtools sort - $(1) | |
${HOME}/package/samtools-0.1.19/samtools view -c $$@ | |
${HOME}/package/samtools-0.1.19/samtools index $$@ | |
endef | |
all: $(addsuffix .bam,${SAMPLES}) | |
rm -f jeter.zip | |
zip jeter.zip $^ $(addsuffix .bam.bai,${SAMPLES}) | |
$(foreach S,${SAMPLES},$(eval $(call run,${S}))) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function accept(r) { | |
/* skip unmapped, secondary, supplementary reads or bad reference name */ | |
if(r.getReadUnmappedFlag() || r.isSecondaryOrSupplementary() || r.getReferenceName()!="chrABCD" ) return false; | |
var cigar = r.getCigar(); | |
/* skip read if cigar is missing or if the is only one cigar element */ | |
if(cigar==null || cigar.numCigarElements()<2) return false; | |
var ref= r.getAlignmentStart(); | |
var i; | |
/* loop over cigar elements */ | |
for(i=0;i< cigar.numCigarElements();++i) | |
{ | |
var c= cigar.getCigarElement(i); | |
var op = c.getOperator(); | |
/* end of cigar alignement */ | |
var r_next = ref; | |
if(op.consumesReferenceBases()) { | |
r_next += c.getLength(); | |
} | |
/* is it a deletion or an skipped intron ? */ | |
if(op.name()=="D" || op.name()=="N") { | |
if(ref==12345 && r_next==56789 ) return true; | |
if(ref==66666 && r_next==77777 ) return true; | |
} | |
ref=r_next; | |
} | |
return false; | |
} | |
accept(record); |
That's it,
Pierre
No comments:
Post a Comment