Inserting the result of a BLAST into a Database using XSLT.
Here is the XML output of a BLAST:
<?xml version="1.0"?> <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd"> <BlastOutput> <BlastOutput_program>tblastn</BlastOutput_program> <BlastOutput_version>TBLASTN 2.2.27+</BlastOutput_version> <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Z hang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference> <BlastOutput_db>nr</BlastOutput_db> <BlastOutput_query-ID>52385</BlastOutput_query-ID> <BlastOutput_query-def>myseq</BlastOutput_query-def> <BlastOutput_query-len>30</BlastOutput_query-len> <BlastOutput_param> <Parameters> <Parameters_matrix>BLOSUM62</Parameters_matrix> <Parameters_expect>10</Parameters_expect> <Parameters_gap-open>11</Parameters_gap-open> <Parameters_gap-extend>1</Parameters_gap-extend> <Parameters_filter>L;</Parameters_filter> </Parameters> </BlastOutput_param> <BlastOutput_iterations> <Iteration> <Iteration_iter-num>1</Iteration_iter-num> <Iteration_query-ID>52385</Iteration_query-ID> <Iteration_query-def>myseq</Iteration_query-def> <Iteration_query-len>30</Iteration_query-len> <Iteration_hits> <Hit> <Hit_num>1</Hit_num> <Hit_id>gi|110624327|dbj|AK225891.1|</Hit_id> <Hit_def>Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02</Hit_def> <Hit_accession>AK225891</Hit_accession> <Hit_len>1829</Hit_len> <Hit_hsps> <Hsp> <Hsp_num>1</Hsp_num> <Hsp_bit-score>62.3882</Hsp_bit-score> <Hsp_score>150</Hsp_score> <Hsp_evalue>4.01658e-10</Hsp_evalue> <Hsp_query-from>1</Hsp_query-from> <Hsp_query-to>30</Hsp_query-to> <Hsp_hit-from>250</Hsp_hit-from> <Hsp_hit-to>339</Hsp_hit-to> <Hsp_query-frame>0</Hsp_query-frame> <Hsp_hit-frame>1</Hsp_hit-frame> <Hsp_identity>30</Hsp_identity> <Hsp_positive>30</Hsp_positive> <Hsp_gaps>0</Hsp_gaps> <Hsp_align-len>30</Hsp_align-len> <Hsp_qseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_qseq> <Hsp_hseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_hseq> <Hsp_midline>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_midline> </Hsp> </Hit_hsps> </Hit> <Hit> <Hit_num>2</Hit_num> <Hit_id>gi|6176337|gb|AF188530.1|AF188530</Hit_id> <Hit_def>Homo sapiens ubiquitous tetratricopeptide containing protein RoXaN mRNA, partial cds</Hit_def&g t; <Hit_accession>AF188530</Hit_accession> <Hit_len>2398</Hit_len> <Hit_hsps> <Hsp> <Hsp_num>1</Hsp_num> <Hsp_bit-score>62.3882</Hsp_bit-score> <Hsp_score>150</Hsp_score> <Hsp_evalue>4.12279e-10</Hsp_evalue> <Hsp_query-from>1</Hsp_query-from> <Hsp_query-to>30</Hsp_query-to> <Hsp_hit-from>105</Hsp_hit-from> <Hsp_hit-to>194</Hsp_hit-to> <Hsp_query-frame>0</Hsp_query-frame> <Hsp_hit-frame>3</Hsp_hit-frame> <Hsp_identity>30</Hsp_identity> <Hsp_positive>30</Hsp_positive> <Hsp_gaps>0</Hsp_gaps> <Hsp_align-len>30</Hsp_align-len> <Hsp_qseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_qseq> <Hsp_hseq>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_hseq> <Hsp_midline>MERQKRKADIEKGLQFIQSTLPLKQEEYEA</Hsp_midline> </Hsp> </Hit_hsps> (...)We want to insert that XML file into a database. I wrote the following XSLT stylesheet , it transforms the blast-xml into a set of SQL statements for sqlite3.
xsltproc --novalid blast2sqlite.xsl blast.xml
create table BlastOutput(
(...)
BEGIN TRANSACTION;
insert into BlastOutput(
program,
version,
reference,
db,
query_ID,
query_def,
query_len
)
values (
'tblastn',
'TBLASTN 2.2.27+',
'Stephen F. Altschul, Thomas L. Madden, Alejandro A. Schäffer, Jinghui Zhang,Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.',
'nr',
'52385',
'myseq',
30
);
insert into Parameters(
blastOutput_id,
expect,
matrix,
sc_match,
sc_mismatch,
gap_open,
gap_extend,
filter
)
select MAX(id),
10,
'BLOSUM62',
NULL,
NULL,
11,
1,
'L;'
from BlastOutput;
insert into Iteration(
blastOutput_id,
iter_num,
query_id,
query_def,
query_len
)
select MAX(id),
1,
'52385',
'myseq',
30
from BlastOutput;
insert into Hit(iteration_id,num,hit_id,def,accession,len)
select MAX(id),
1,
'gi|110624327|dbj|AK225891.1|',
'Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02',
'AK225891',
1829
from Iteration;
(...)
All in one you can redirect the output to sqlite3.xsltproc --novalid blast2sqlite.xsl blast.xml |\ sqlite3 input.dband query the database:
$ sqlite3 -header -line input.sqlite \
'select * from Hsp,Hit where Hsp.hit_id=Hit.id limit 2'
id = 1
hit_id = 1
num = 1
bit_score = 62.3882
score = 150.0
evalue = 4.01658e-10
query_from = 1
query_to = 30
hit_from = 250
hit_to = 339
query_frame = 0
hit_frame = 1
identity = 30
positive = 30
gaps = 0
align_len = 30
qseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
hseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
midline = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
id = 1
iteration_id = 1
num = 1
hit_id = gi|110624327|dbj|AK225891.1|
def = Homo sapiens mRNA for zinc finger CCCH-type containing 7B variant, clone: FCC121C02
accession = AK225891
len = 1829
id = 2
hit_id = 2
num = 1
bit_score = 62.3882
score = 150.0
evalue = 4.12279e-10
query_from = 1
query_to = 30
hit_from = 105
hit_to = 194
query_frame = 0
hit_frame = 3
identity = 30
positive = 30
gaps = 0
align_len = 30
qseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
hseq = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
midline = MERQKRKADIEKGLQFIQSTLPLKQEEYEA
id = 2
iteration_id = 1
num = 2
hit_id = gi|6176337|gb|AF188530.1|AF188530
def = Homo sapiens ubiquitous tetratricopeptide containing protein RoXaN mRNA, partial cds
accession = AF188530
len = 2398
That's it,Pierre
No comments:
Post a Comment