Inserting the result of a BLAST into a Database using XSLT.
Here is the XML output of a BLAST:
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>tblastn</BlastOutput_program>
<BlastOutput_version>TBLASTN 2.2.27+</BlastOutput_version>
<BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang,
Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation
of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>nr</BlastOutput_db>
<BlastOutput_query-ID>52385</BlastOutput_query-ID>
<BlastOutput_query-def>myseq</BlastOutput_query-def>
<BlastOutput_query-len>30</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_matrix>BLOSUM62</Parameters_matrix>
<Parameters_expect>10</Parameters_expect>
<Parameters_gap-open>11</Parameters_gap-open>
<Parameters_gap-extend>1</Parameters_gap-extend>
<Parameters_filter>L;</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>52385</Iteration_query-ID>
<Iteration_query-def>myseq</Iteration_query-def>
<Iteration_query-len>30</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>gi|110624327|dbj|AK225891.1|</Hit_id>
<Hit_def>Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02</Hit_def>
<Hit_accession>AK225891</Hit_accession>
<Hit_len>1829</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>62.3882</Hsp_bit-score>
<Hsp_score>150</Hsp_score>
<Hsp_evalue>4.01658e-10</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>30</Hsp_query-to>
<Hsp_hit-from>250</Hsp_hit-from>
<Hsp_hit-to>339</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>1</Hsp_hit-frame>
<Hsp_identity>30</Hsp_identity>
<Hsp_positive>30</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>30</Hsp_align-len>
<Hsp_qseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_qseq>
<Hsp_hseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_hseq>
<Hsp_midline>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>gi|6176337|gb|AF188530.1|AF188530</Hit_id>
<Hit_def>Homo sapiens ubiquitous tetratricopeptide containing protein RoXaN mRNA, partial cds</Hit_def>
<Hit_accession>AF188530</Hit_accession>
<Hit_len>2398</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>62.3882</Hsp_bit-score>
<Hsp_score>150</Hsp_score>
<Hsp_evalue>4.12279e-10</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>30</Hsp_query-to>
<Hsp_hit-from>105</Hsp_hit-from>
<Hsp_hit-to>194</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>3</Hsp_hit-frame>
<Hsp_identity>30</Hsp_identity>
<Hsp_positive>30</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>30</Hsp_align-len>
<Hsp_qseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_qseq>
<Hsp_hseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_hseq>
<Hsp_midline>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_midline>
</Hsp>
</Hit_hsps>
(...)
We want to insert that XML file into a database. I wrote the following XSLT stylesheet; it transforms the BLAST XML into a set of SQL statements for sqlite3.
$ xsltproc --novalid blast2sqlite.xsl blast.xml
create table BlastOutput(
(...)
BEGIN TRANSACTION;
insert into BlastOutput(
program,
version,
reference,
db,
query_ID,
query_def,
query_len
)
values (
'tblastn',
'TBLASTN 2.2.27+',
'Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang,Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.',
'nr',
'52385',
'myseq',
30
);
insert into Parameters(
blastOutput_id,
expect,
matrix,
sc_match,
sc_mismatch,
gap_open,
gap_extend,
filter
)
select MAX(id),
10,
'BLOSUM62',
NULL,
NULL,
11,
1,
'L;'
from BlastOutput;
insert into Iteration(
blastOutput_id,
iter_num,
query_id,
query_def,
query_len
)
select MAX(id),
1,
'52385',
'myseq',
30
from BlastOutput;
insert into Hit(iteration_id,num,hit_id,def,accession,len)
select MAX(id),
1,
'gi|110624327|dbj|AK225891.1|',
'Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02',
'AK225891',
1829
from Iteration;
(...)
All in one, you can redirect the output to sqlite3:
$ xsltproc --novalid blast2sqlite.xsl blast.xml |\
  sqlite3 input.sqlite
and query the database:
$ sqlite3 -header -line input.sqlite \
'select * from Hsp,Hit where Hsp.hit_id=Hit.id limit 2'
id = 1
hit_id = 1
num = 1
bit_score = 62.3882
score = 150.0
evalue = 4.01658e-10
query_from = 1
query_to = 30
hit_from = 250
hit_to = 339
query_frame = 0
hit_frame = 1
identity = 30
positive = 30
gaps = 0
align_len = 30
qseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
hseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
midline = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
id = 1
iteration_id = 1
num = 1
hit_id = gi|110624327|dbj|AK225891.1|
def = Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02
accession = AK225891
len = 1829
id = 2
hit_id = 2
num = 1
bit_score = 62.3882
score = 150.0
evalue = 4.12279e-10
query_from = 1
query_to = 30
hit_from = 105
hit_to = 194
query_frame = 0
hit_frame = 3
identity = 30
positive = 30
gaps = 0
align_len = 30
qseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
hseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
midline = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
id = 2
iteration_id = 1
num = 2
hit_id = gi|6176337|gb|AF188530.1|AF188530
def = Homo sapiens ubiquitous tetratricopeptide containing protein RoXaN mRNA, partial cds
accession = AF188530
len = 2398
That's it,
Pierre
No comments:
Post a Comment