But an XSLT stylesheet can be applied with xsltstream to all the <Rs> elements of 'ds_ch1.xml.gz':
java -jar xsltstream.jar -x 'http://lindenb.googlecode.com/svn/trunk/src/xsl/dbsnp2rdf.xsl' -q Rs \
'ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/XML/ds_ch1.xml.gz' |\
grep -v "rdf:RDF" | grep -v "<?xml version"
(...)
<o:SNP rdf:about="http://www.ncbi.nlm.nih.gov/snp/830">
<dc:title>rs830</dc:title>
<o:taxon rdf:resource="http://www.ncbi.nlm.nih.gov/taxonomy/9606"/>
<o:het rdf:datatype="http://www.w3.org/2001/XMLSchema#float">0.02</o:het>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:WIAF"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SNP500CANCER"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SEQUENOM"/>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:Celera/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66444409</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66444410</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:HuRef/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66263806</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66263807</o:end>
<o:orient>-</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:reference/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">67926134</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">67926135</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
</o:SNP>
<o:SNP rdf:about="http://www.ncbi.nlm.nih.gov/snp/844">
<dc:title>rs844</dc:title>
<o:taxon rdf:resource="http://www.ncbi.nlm.nih.gov/taxonomy/9606"/>
<o:het rdf:datatype="http://www.w3.org/2001/XMLSchema#float">0.42</o:het>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:WIAF"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:LEE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:HGBASE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SC_JCM"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:TSC-CSHL"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:LEE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:YUSUKE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:CGAP-GAI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:CSHL-HAPMAP"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:PERLEGEN"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:ABI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SI_EXO"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:BCMHGSC_JDW"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:HUMANGENOME_JCVI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SNP500CANCER"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:1000GENOMES"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:ILLUMINA-UK"/>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:Celera/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">134750981</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">134750982</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:HuRef/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">132892081</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">132892082</o:end>
<o:orient>-</o:orient>
(...)
'ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/XML/ds_ch1.xml.gz' |\
grep -v "rdf:RDF" | grep -v "<?xml version"
(...)
<o:SNP rdf:about="http://www.ncbi.nlm.nih.gov/snp/830">
<dc:title>rs830</dc:title>
<o:taxon rdf:resource="http://www.ncbi.nlm.nih.gov/taxonomy/9606"/>
<o:het rdf:datatype="http://www.w3.org/2001/XMLSchema#float">0.02</o:het>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:WIAF"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SNP500CANCER"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SEQUENOM"/>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:Celera/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66444409</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66444410</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:HuRef/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66263806</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">66263807</o:end>
<o:orient>-</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:reference/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">67926134</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">67926135</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
</o:SNP>
<o:SNP rdf:about="http://www.ncbi.nlm.nih.gov/snp/844">
<dc:title>rs844</dc:title>
<o:taxon rdf:resource="http://www.ncbi.nlm.nih.gov/taxonomy/9606"/>
<o:het rdf:datatype="http://www.w3.org/2001/XMLSchema#float">0.42</o:het>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:WIAF"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:LEE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:HGBASE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SC_JCM"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:TSC-CSHL"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:LEE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:YUSUKE"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:CGAP-GAI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:CSHL-HAPMAP"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:PERLEGEN"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:ABI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SI_EXO"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:BCMHGSC_JDW"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:HUMANGENOME_JCVI"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:SNP500CANCER"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:1000GENOMES"/>
<o:hasHandle rdf:resource="urn:void:ncbi:snp:handle:ILLUMINA-UK"/>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:Celera/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">134750981</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">134750982</o:end>
<o:orient>+</o:orient>
</o:Mapping>
</o:hasMapping>
<o:hasMapping>
<o:Mapping>
<o:build rdf:resource="urn:void:ncbi:build:HuRef/36_3"/>
<o:chrom rdf:resource="urn:void:ncbi:chromosome:9606/chr1"/>
<o:start rdf:datatype="http://www.w3.org/2001/XMLSchema#int">132892081</o:start>
<o:end rdf:datatype="http://www.w3.org/2001/XMLSchema#int">132892082</o:end>
<o:orient>-</o:orient>
(...)
The java archive for xsltstream is available at http://lindenb.googlecode.com/files/xsltstream.jar
Usage:
-x <xslt-stylesheet file/url> required
-p <param-name> <param-value> (add parameter to the xslt engine)
-d depth (0 based) default:-1
-q qName target default:null
<file>|stdin
That's it !
Pierre
There's a lot of capability for streamed processing in Saxon-EE these days, which goes well beyond this - though I agree there are many cases which can be solved using this kind of approach.
Thanks Michael, it is nice to know.
Hi Pierre,
Digging through your code I notice that it interprets the SAX events to build the DOM. I would guess that you have done this because there is no obvious event mechanism available as a DOM is built normally. However, there is one - see Document Object Model Load and Save.
Below is my rough code to transform nodes in a stream.
Reg
/**
 * Applies an XSLT stylesheet to every element with a given name while an XML
 * document is being parsed from standard input, writing each transformation
 * result to standard output.
 *
 * <p>The document is parsed with a DOM Level 3 Load and Save parser. A parser
 * filter hands each completed target element to the transformer and then
 * rejects it, so nodes are discarded as soon as the filter has seen them.
 *
 * <p>Usage: {@code java Transform <target-node-name> <xslt-file> < input.xml}
 */
public class Transform {

    /**
     * Runs the streaming transformation.
     *
     * <p>Before each transformation the stylesheet parameter {@code nodeIndex}
     * is set to the 1-based count of target elements seen so far.
     *
     * @param args {@code args[0]} is the element name to transform (compared
     *             with {@link Node#getNodeName()}); {@code args[1]} is the path
     *             of the XSLT stylesheet file
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IllegalStateException if no Load and Save DOM implementation is
     *                               available
     * @throws Exception if the stylesheet cannot be compiled, the input cannot
     *                   be parsed, or a transformation fails
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException(
                    "Usage: java Transform <target-node-name> <xslt-file> < input.xml");
        }
        final String targetNodeName = args[0];
        String xsltFilename = args[1];
        InputStream inputStream = System.in;
        OutputStream outputStream = System.out;

        /*
         * Get hold of factory for LoadSave objects. See
         * http://www.w3.org/TR/DOM-Level-3-LS/load-save.html
         */
        DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
        DOMImplementationLS domImpl = (DOMImplementationLS) registry
                .getDOMImplementation("XML 1.0 LS 3.0");
        if (domImpl == null) {
            throw new IllegalStateException(
                    "No DOM implementation supporting \"XML 1.0 LS 3.0\" is available");
        }

        /*
         * Compile the stylesheet. It is handed over as a File rather than a
         * Reader so that the XML parser decodes it from its own encoding
         * declaration, the source carries a system id, and no reader is left
         * open. Each thread would need to create its own Transformer from the
         * Templates.
         */
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Templates templates = transformerFactory.newTemplates(
                new StreamSource(new java.io.File(xsltFilename)));
        final Transformer transformer = templates.newTransformer();
        transformer.setOutputProperty(
                javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes");

        final Result result = new StreamResult(outputStream);
        // Holds the first transformation failure so it can be rethrown once
        // the parser has stopped; the filter callbacks cannot throw it.
        final Exception[] transformFailure = new Exception[1];

        /*
         * Filter that rejects all nodes outside of a target node, and rejects
         * the target node itself once it has been transformed.
         *
         * NOTE(review): withinTargetNode is a single flag, so a target element
         * nested inside another target element presumably clears it early and
         * the outer element loses its remaining content - confirm whether
         * nested targets need to be supported.
         */
        LSParserFilter filter = new LSParserFilter() {
            boolean withinTargetNode = false;
            int indexInStream = 0;

            @Override
            public short acceptNode(Node node) {
                if (Node.ELEMENT_NODE == node.getNodeType()
                        && targetNodeName.equals(node.getNodeName())) {
                    // A completed target node: transform it to the output.
                    indexInStream++;
                    transformer.setParameter("nodeIndex", indexInStream);
                    DOMSource domSource = new DOMSource(node);
                    try {
                        transformer.transform(domSource, result);
                    } catch (Exception ex) {
                        // Stop parsing; main() rethrows the failure so the
                        // caller sees it instead of a silent early exit.
                        transformFailure[0] = ex;
                        return FILTER_INTERRUPT;
                    }
                    withinTargetNode = false;
                    return FILTER_REJECT;
                }
                return withinTargetNode ? FILTER_ACCEPT : FILTER_REJECT;
            }

            @Override
            public int getWhatToShow() {
                return NodeFilter.SHOW_ALL;
            }

            @Override
            public short startElement(Element element) {
                if (targetNodeName.equals(element.getNodeName())) {
                    withinTargetNode = true;
                }
                return FILTER_ACCEPT;
            }
        };

        /*
         * Create a parser which uses the filter, and parse the input stream.
         * The returned document is ignored.
         */
        LSParser parser = domImpl.createLSParser(
                DOMImplementationLS.MODE_SYNCHRONOUS, null);
        parser.setFilter(filter);
        LSInput input = domImpl.createLSInput();
        input.setByteStream(inputStream);
        parser.parse(input);

        outputStream.flush();
        if (transformFailure[0] != null) {
            throw transformFailure[0];
        }
    }
}