|
/** |
|
* Author: Pierre Lindenbaum PhD |
|
* plindenbaum@yahoo.fr |
|
* Date: 2012-11 |
|
* Motivation: RDFGraph from openoffice calc files |
|
* |
|
*/ |
|
package oocalc; |
|
|
|
import java.io.File; |
|
import java.io.FileNotFoundException; |
|
import java.io.FileReader; |
|
import java.io.InputStream; |
|
import java.util.ArrayList; |
|
import java.util.LinkedList; |
|
import java.util.List; |
|
import java.util.zip.ZipEntry; |
|
import java.util.zip.ZipFile; |
|
|
|
import javax.xml.namespace.QName; |
|
import javax.xml.stream.XMLEventReader; |
|
import javax.xml.stream.XMLInputFactory; |
|
import javax.xml.stream.XMLStreamException; |
|
import javax.xml.stream.events.Attribute; |
|
import javax.xml.stream.events.EndElement; |
|
import javax.xml.stream.events.StartElement; |
|
import javax.xml.stream.events.XMLEvent; |
|
import com.hp.hpl.jena.assembler.assemblers.AssemblerBase; |
|
import com.hp.hpl.jena.assembler.Assembler; |
|
import com.hp.hpl.jena.sparql.core.assembler.AssemblerUtils; |
|
import com.hp.hpl.jena.assembler.Mode; |
|
import com.hp.hpl.jena.datatypes.RDFDatatype; |
|
import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; |
|
import com.hp.hpl.jena.rdf.model.Resource; |
|
import com.hp.hpl.jena.rdf.model.Statement; |
|
import com.hp.hpl.jena.rdf.model.StmtIterator; |
|
import com.hp.hpl.jena.rdf.model.Property; |
|
import com.hp.hpl.jena.graph.Node; |
|
import com.hp.hpl.jena.graph.Triple; |
|
import com.hp.hpl.jena.graph.TripleMatch; |
|
import com.hp.hpl.jena.graph.TripleMatchIterator; |
|
import com.hp.hpl.jena.graph.impl.GraphBase; |
|
import com.hp.hpl.jena.rdf.model.AnonId; |
|
import com.hp.hpl.jena.rdf.model.ResourceFactory; |
|
import com.hp.hpl.jena.rdf.model.impl.ModelCom; |
|
import com.hp.hpl.jena.util.iterator.ExtendedIterator; |
|
import com.hp.hpl.jena.util.iterator.NiceIterator; |
|
import com.hp.hpl.jena.sparql.core.DatasetImpl; |
|
import com.hp.hpl.jena.vocabulary.DC; |
|
import com.hp.hpl.jena.vocabulary.RDF; |
|
import com.hp.hpl.jena.vocabulary.XSD; |
|
import com.hp.hpl.jena.query.Dataset; |
|
import org.slf4j.LoggerFactory; |
|
import com.hp.hpl.jena.query.*; |
|
|
|
/** |
|
* implementation of a RDF Graph for OpenOffice calc |
|
* |
|
*/ |
|
|
|
public class OpenOfficeCalcGraph |
|
extends GraphBase |
|
{ |
|
/** logger */ |
|
protected static final org.slf4j.Logger LOG= LoggerFactory.getLogger("ooffice2rdf"); |
|
/** namespaces */ |
|
private static final String OFFICE="urn:oasis:names:tc:opendocument:xmlns:office:1.0"; |
|
private static final String TABLE="urn:oasis:names:tc:opendocument:xmlns:table:1.0"; |
|
private static final String TEXT="urn:oasis:names:tc:opendocument:xmlns:text:1.0"; |
|
private static final String NS="http://rdf.lindenb.org/"; |
|
/** attributes */ |
|
private static final QName number_columns_repeated=new QName(TABLE,"number-columns-repeated","table"); |
|
private static final QName number_rows_repeated=new QName(TABLE,"number-rows-repeated","table"); |
|
private static final QName value_type=new QName(OFFICE,"value-type","office"); |
|
private static final QName value=new QName(OFFICE,"value","office"); |
|
private static final QName name=new QName(TABLE,"name","table"); |
|
//rdf:type Node |
|
private static final Node rdfType=Node.createURI(RDF.type.getURI()); |
|
//all open office files |
|
private List<File> caclFiles=null; |
|
|
|
|
|
|
|
/** static Assembler for OpenOfficeCalcGraph |
|
* An assembler creates a Dataset(graph) from a RDF-based configuration file. |
|
* It is called by Fuseki |
|
*/ |
|
public static OpenOfficeAssembler assembler = new OpenOfficeAssembler(); |
|
|
|
|
|
public static class OpenOfficeAssembler extends AssemblerBase implements Assembler |
|
{ |
|
@Override |
|
public Object open( Assembler a, Resource root, Mode mode ) |
|
{ |
|
//read the configuration an get the files |
|
List<File> files=new ArrayList<File>(); |
|
StmtIterator iter=root.listProperties(fileRsrc); |
|
while(iter.hasNext()) |
|
{ |
|
Statement stmt=iter.nextStatement(); |
|
if(!stmt.getObject().isLiteral()) throw new RuntimeException("Not a literal "+stmt); |
|
String lit=stmt.getString(); |
|
File file=new File(lit); |
|
if(!file.exists()) throw new RuntimeException("File not found : "+file); |
|
if(!file.getName().endsWith(".ods")) throw new RuntimeException("Not an .ods file : "+file); |
|
files.add(file); |
|
} |
|
iter.close(); |
|
OpenOfficeCalcGraph g=new OpenOfficeCalcGraph(files); |
|
OpenOfficeCalcModel m=new OpenOfficeCalcModel(g); |
|
Dataset ds=new DatasetImpl(m); |
|
return ds; |
|
} |
|
} |
|
|
|
/** Initializer for Fuseki */ |
|
private static boolean init_called = false ; |
|
private static final Resource buildRsrc=ResourceFactory.createResource(NS+"build"); |
|
private static final Property fileRsrc=ResourceFactory.createProperty(NS+"file"); |
|
|
|
/**
 * Static initializer: runs when this class is initialized and registers
 * the assembler with Assembler.general, so that configuration files
 * (the original author targets Fuseki) can refer to it.
 * The resource name for this assembler is buildRsrc.
 */
static { init(); }

/**
 * Registers {@link #assembler} under {@code buildRsrc}; the work is done
 * only on the first call, later calls return immediately.
 */
private static void init()
{
    if (!init_called)
    {
        LOG.info("Calling OpenOfficeCalcGraph init");
        AssemblerUtils.init();
        Assembler.general.implementWith(buildRsrc, assembler);

        // remember that the registration has been performed
        init_called = true;
    }
}
|
|
|
|
|
/** RDF Model for OpenOfficeCalcGraph */ |
|
public static class OpenOfficeCalcModel extends ModelCom |
|
{ |
|
public OpenOfficeCalcModel(OpenOfficeCalcGraph g) |
|
{ |
|
super(g); |
|
} |
|
} |
|
/* one row in the spredsheet */ |
|
private static class Row |
|
{ |
|
int repeat=1; |
|
private List<Cell> cells=new ArrayList<Cell>(); |
|
} |
|
|
|
/* one cell in the spredsheet */ |
|
private static class Cell |
|
{ |
|
int repeat=1; |
|
String type=null; |
|
String value=null; |
|
String literal=null; |
|
} |
|
|
|
/** Constructor from an array of OO files */ |
|
public OpenOfficeCalcGraph(List<File> calcFiles) |
|
{ |
|
this.caclFiles=new ArrayList<File>(calcFiles); |
|
this.getPrefixMapping().setNsPrefix("office", NS); |
|
this.getPrefixMapping().setNsPrefix("xsd", XSD.getURI()); |
|
this.getPrefixMapping().setNsPrefix("dc", DC.getURI()); |
|
} |
|
|
|
|
|
@Override |
|
protected ExtendedIterator<Triple> graphBaseFind(TripleMatch matcher) |
|
{ |
|
return new TripleMatchIterator((Triple)matcher, new CellIterator()); |
|
} |
|
|
|
/** parse the openoffice files and get the Triples */ |
|
private class CellIterator extends NiceIterator<Triple> |
|
{ |
|
/** current index in array of OO files */ |
|
private int fileIndex=-1; |
|
/** buffer of triples */ |
|
private List<Triple> buffer=new LinkedList<Triple>(); |
|
/** next triple to be returned */ |
|
private Triple next=null; |
|
/** was hasNext() called ? */ |
|
private boolean hasNextCalled=false; |
|
/** current OO file opened */ |
|
private File ioFile=null; |
|
/** Zip Handler for OO file */ |
|
private ZipFile zipFile=null; |
|
/** Input Stream for current Zip entry */ |
|
private InputStream zipInputStream; |
|
/** xml-handler for current zip entry */ |
|
private XMLEventReader xmlEventReader; |
|
/* rdf subject for file */ |
|
private Node fileRsrc=null; |
|
/* rdf subject for tab */ |
|
private Node tabRsrc=null; |
|
/** current tab index */ |
|
private int tabIndex=0; |
|
/* current column */ |
|
private int X=0; |
|
/** current row */ |
|
private int Y=0; |
|
|
|
private void add(Node s,Node p,Node o) |
|
{ |
|
this.buffer.add(Triple.create(s, p, o)); |
|
} |
|
|
|
/** Creates an iterator; no file is opened here, the work starts in hasNext(). */
public CellIterator()
{
    // nothing to do: all state comes from the field initializers
}
|
|
|
private boolean isA(XMLEvent evt,String ns,String localName) |
|
{ |
|
QName q=null; |
|
|
|
if(evt.isStartElement()) |
|
{ |
|
q=evt.asStartElement().getName(); |
|
} |
|
else if(evt.isEndElement()) |
|
{ |
|
q=evt.asEndElement().getName(); |
|
} |
|
return q!=null && |
|
q.getNamespaceURI().equals(ns) && |
|
q.getLocalPart().equals(localName) |
|
; |
|
} |
|
|
|
@Override |
|
public boolean hasNext() |
|
{ |
|
if(!hasNextCalled) |
|
{ |
|
hasNextCalled=true; |
|
next=null; |
|
for(;;) |
|
{ |
|
if(!buffer.isEmpty()) |
|
{ |
|
next=buffer.remove(0); |
|
break; |
|
} |
|
|
|
try |
|
{ |
|
|
|
if(xmlEventReader==null) |
|
{ |
|
//open next file |
|
if(fileIndex+1>=OpenOfficeCalcGraph.this.caclFiles.size()) break; |
|
this.fileIndex++; |
|
this.tabIndex=0; |
|
//open XML StaX reader for current OO file |
|
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance(); |
|
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE); |
|
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE); |
|
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE); |
|
try |
|
{ |
|
this.ioFile=OpenOfficeCalcGraph.this.caclFiles.get(this.fileIndex); |
|
this.zipFile=new ZipFile(this.ioFile); |
|
ZipEntry zipEntry=zipFile.getEntry("content.xml"); |
|
if(zipEntry==null) throw new RuntimeException("Cannot get content.xml"); |
|
this.zipInputStream=this.zipFile.getInputStream(zipEntry); |
|
xmlEventReader= xmlInputFactory.createXMLEventReader(this.zipInputStream); |
|
//describe the file as RDF |
|
this.fileRsrc=Node.createURI(this.ioFile.toURI().toASCIIString()); |
|
add(this.fileRsrc,rdfType,Node.createURI(NS+"Spreadsheet")); |
|
add(this.fileRsrc,Node.createURI(DC.title.getURI()),Node.createLiteral(this.ioFile.getName())); |
|
continue; |
|
} |
|
catch (Exception e) |
|
{ |
|
throw new RuntimeException(e); |
|
} |
|
} |
|
|
|
|
|
if(xmlEventReader.hasNext()) |
|
{ |
|
Attribute att=null; |
|
XMLEvent evt=xmlEventReader.nextEvent(); |
|
if(evt.isStartElement()) |
|
{ |
|
StartElement E=evt.asStartElement(); |
|
if(isA(E,TABLE,"table")) |
|
{ |
|
att=E.getAttributeByName(name); |
|
this.tabIndex++; |
|
//describe the tab as RDF |
|
this.tabRsrc=Node.createURI(this.ioFile.toURI().toASCIIString()+"/t"+tabIndex); |
|
add(this.tabRsrc,Node.createURI(NS+"file"),this.fileRsrc); |
|
add(this.tabRsrc,rdfType,Node.createURI(NS+"Table")); |
|
add(this.tabRsrc,Node.createURI(DC.title.getURI()),Node.createLiteral(att.getValue())); |
|
this.X=0; |
|
this.Y=0; |
|
} |
|
else if(isA(E,TABLE,"table-row")) |
|
{ |
|
//parse the row |
|
Row row=parseRow(E); |
|
//create the statements for that row |
|
for(int i=0;i< row.repeat;++i) |
|
{ |
|
this.X=0; |
|
this.Y++; |
|
for(Cell cell:row.cells) |
|
{ |
|
for(int j=0;j< cell.repeat;++j) |
|
{ |
|
this.X++; |
|
if(cell.value==null && cell.literal==null) continue; |
|
Node subject=Node.createURI(this.ioFile.toURI().toASCIIString()+"/t"+tabIndex+"/y"+Y+"/x"+X); |
|
add(subject,Node.createURI(NS+"table"),this.tabRsrc); |
|
add(subject,rdfType,Node.createURI(NS+"Cell")); |
|
|
|
add(subject,Node.createURI(NS+"X"),Node.createLiteral(String.valueOf(X),null,XSDDatatype.XSDint)); |
|
add(subject,Node.createURI(NS+"Y"),Node.createLiteral(String.valueOf(Y),null,XSDDatatype.XSDint)); |
|
Node cellValue=null; |
|
if(cell.type!=null && cell.value!=null) |
|
{ |
|
XSDDatatype dataType=XSDDatatype.XSDstring; |
|
if(cell.type.equals("float")) |
|
{ |
|
dataType=XSDDatatype.XSDfloat; |
|
} |
|
else if(cell.type.equals("int")) |
|
{ |
|
dataType=XSDDatatype.XSDint; |
|
} |
|
cellValue=Node.createLiteral(cell.value, null, dataType); |
|
} |
|
else |
|
{ |
|
cellValue=Node.createLiteral(String.valueOf(cell.literal)); |
|
} |
|
add( subject, |
|
Node.createURI(NS+"value"), |
|
cellValue |
|
); |
|
|
|
} |
|
} |
|
} |
|
} |
|
} |
|
else if(evt.isEndElement()) |
|
{ |
|
if(isA(evt,TABLE,"table")) |
|
{ |
|
this.tabRsrc=null; |
|
} |
|
} |
|
} |
|
else //we're done for that file. |
|
{ |
|
this.xmlEventReader.close(); |
|
this.zipInputStream.close(); |
|
this.zipFile.close(); |
|
this.xmlEventReader=null; |
|
this.zipInputStream=null; |
|
this.zipFile=null; |
|
this.fileRsrc=null; |
|
this.ioFile=null; |
|
} |
|
} |
|
catch(Exception err) |
|
{ |
|
throw new RuntimeException(err); |
|
} |
|
} |
|
} |
|
return next!=null; |
|
} |
|
|
|
@Override |
|
public void close() |
|
{ |
|
try { if(this.xmlEventReader!=null) this.xmlEventReader.close(); } catch (Exception e) {} |
|
this.xmlEventReader=null; |
|
try { if(this.zipInputStream!=null) this.zipInputStream.close(); } catch (Exception e) {} |
|
this.zipInputStream=null; |
|
try { if(this.zipFile!=null) this.zipFile.close(); } catch (Exception e) {} |
|
this.zipFile=null; |
|
this.buffer.clear(); |
|
this.fileIndex=caclFiles.size(); |
|
} |
|
|
|
@Override |
|
public Triple next() |
|
{ |
|
if(!hasNextCalled) hasNext(); |
|
if(!hasNext()) throw new IllegalStateException(); |
|
Triple t=next; |
|
next=null; |
|
hasNextCalled=false; |
|
return t; |
|
} |
|
|
|
|
|
/** parses a table:table-row */ |
|
private Row parseRow(StartElement root) |
|
throws XMLStreamException |
|
{ |
|
Row row=new Row(); |
|
Attribute att=root.getAttributeByName(number_rows_repeated); |
|
|
|
if(att!=null) |
|
{ |
|
row.repeat=Integer.parseInt(att.getValue()); |
|
} |
|
while(this.xmlEventReader.hasNext()) |
|
{ |
|
XMLEvent evt=this.xmlEventReader.nextEvent(); |
|
if(evt.isStartElement()) |
|
{ |
|
StartElement E=evt.asStartElement(); |
|
if(isA(E,TABLE,"table-cell")) |
|
{ |
|
row.cells.add(parseCell(E)); |
|
} |
|
} |
|
else if(evt.isEndElement()) |
|
{ |
|
if(isA(evt,TABLE,"table-row")) |
|
{ |
|
break; |
|
} |
|
} |
|
} |
|
return row; |
|
} |
|
/** parses a table:table-cell */ |
|
private Cell parseCell(StartElement root) |
|
throws XMLStreamException |
|
{ |
|
Cell cell=new Cell(); |
|
Attribute att=root.getAttributeByName(number_columns_repeated); |
|
|
|
if(att!=null) |
|
{ |
|
cell.repeat=Integer.parseInt(att.getValue()); |
|
} |
|
|
|
att=root.getAttributeByName(value_type); |
|
if(att!=null) |
|
{ |
|
cell.type=att.getValue(); |
|
} |
|
att=root.getAttributeByName(value); |
|
if(att!=null) |
|
{ |
|
cell.value=att.getValue(); |
|
cell.literal=cell.value; |
|
} |
|
|
|
|
|
while(this.xmlEventReader.hasNext()) |
|
{ |
|
XMLEvent evt=this.xmlEventReader.nextEvent(); |
|
if(evt.isStartElement()) |
|
{ |
|
StartElement E=evt.asStartElement(); |
|
if(isA(E,TEXT,"p")) |
|
{ |
|
cell.literal=parseText(E); |
|
} |
|
} |
|
else if(evt.isEndElement()) |
|
{ |
|
if(isA(evt,TABLE,"table-cell")) |
|
{ |
|
break; |
|
} |
|
} |
|
} |
|
return cell; |
|
} |
|
|
|
/** returns the content of <text:p/> */ |
|
private String parseText(StartElement root) |
|
throws XMLStreamException |
|
{ |
|
StringBuilder b=new StringBuilder(); |
|
while(xmlEventReader.hasNext()) |
|
{ |
|
XMLEvent evt=this.xmlEventReader.nextEvent(); |
|
if(evt.isStartElement()) |
|
{ |
|
throw new IllegalStateException(); |
|
} |
|
else if(evt.isEndElement()) |
|
{ |
|
if(isA(evt,TEXT,"p")) |
|
{ |
|
return b.toString(); |
|
} |
|
} |
|
else if(evt.isCharacters()) |
|
{ |
|
b.append(evt.asCharacters().getData()); |
|
} |
|
} |
|
throw new IllegalStateException(); |
|
} |
|
|
|
} |
|
public static void main(String[] args) throws Exception |
|
{ |
|
|
|
if(args.length<2) |
|
{ |
|
System.err.println("Usage: query.sparql file1.ods, file2.ods... filen.ods"); |
|
return; |
|
} |
|
|
|
List<File> files=new ArrayList<File>(); |
|
for(int optind=1;optind< args.length;++optind) |
|
{ |
|
files.add(new File(args[optind])); |
|
} |
|
OpenOfficeCalcGraph g=new OpenOfficeCalcGraph(files); |
|
OpenOfficeCalcModel m=new OpenOfficeCalcModel(g); |
|
|
|
|
|
|
|
com.hp.hpl.jena.query.Query query = QueryFactory.read(args[0]) ; |
|
LOG.info("starting query"); |
|
QueryExecution qexec = QueryExecutionFactory.create(query, m) ; |
|
try { |
|
ResultSet results = qexec.execSelect(); |
|
ResultSetFormatter.out(System.out,results,g.getPrefixMapping()); |
|
|
|
} finally { qexec.close() ; } |
|
|
|
} |
|
} |
|
|