17 February 2010

The path from EgonWillighagen to Jandot : Neo4j , a graph API for java: my notebook.

neo4j "Neo4j is a graph database. It is an embedded, disk-based, fully transactional Java persistence engine that stores data structured in graphs rather than in tables. A graph (mathematical lingo for a network) is a flexible data structure that allows a more agile and rapid style of development.".
In the current post, I'll use the neo4j API to load a set of pubmed entries and find the shortest path between two authors.
First, a few constants are defined:

private static final String NODE_TYPE="node-type";
private static final String TYPE_ARTICLE="article";
private static final String TYPE_AUTHOR="author";
private static final String PROPERTY_PMID="pmid";
private static final String PROPERTY_NAME="name";
private static final String PROPERTY_TITLE="title";
We also need a Neo4J embedded graph as well as two indexes that will be used to quickly find if an article or an author have already been inserted in the graph:
//this graph will be stored in /tmp/neo4j
private GraphDatabaseService graphDB =new EmbeddedGraphDatabase("/tmp/neo4j");
private IndexService findByPmid= new LuceneIndexService( graphDB );
private IndexService findByName= new LuceneIndexService( graphDB );
when the program exists, all those objects have to be closed:
private void close()
{
if(findByPmid!=null)
{
findByPmid.shutdown();
findByPmid=null;
}
if(findByName!=null)
{
findByName.shutdown();
findByName=null;
}
if(graphDB!=null)
{
graphDB.shutdown();
}
graphDB=null;
}
A Stax parser is used to read the XML pubmed entries. For this test, I've loaded a bunch of articles from 'Nature' and from the Biogang. Each time a new pubmed identifier (PMID) is found, the Node is searched in the findByPmid index, else a new node is created:

Transaction txn=graphDB.beginTx();
(...)
if(name.equals("PMID") && article==null)
{
Integer pmid=new Integer(reader.getElementText());
article= findByPmid.getSingleNode(PROPERTY_PMID, pmid);

//article was not already in the graph
if(article==null)
{
article= this.graphDB.createNode();
article.setProperty(NODE_TYPE, TYPE_ARTICLE);
article.setProperty(PROPERTY_PMID, pmid);
findByPmid.index(article, PROPERTY_PMID, pmid);
}
}
(..)
txn.success();
Likewise, each time a new pubmed author (PMID) is found, the Node is searched in the findByName index, else a new node is created.
while(!(evt=reader.nextEvent()).isEndDocument())
{
if(evt.isEndElement())
{
if(initials.isEmpty() && !firstName.isEmpty()) initials=""+firstName.charAt(0);
String s= initials+" "+middle+" "+lastName+" "+suffix;
String name= s.replaceAll("[ ]+", " ").trim();
if(name.isEmpty()) return null;
Node author=findByName.getSingleNode(PROPERTY_NAME, name);
if(author==null)
{
author= this.graphDB.createNode();
author.setProperty(NODE_TYPE, TYPE_AUTHOR);
author.setProperty(PROPERTY_NAME, name);
findByName.index(author, PROPERTY_NAME, name);
}
return author;
}
if(!evt.isStartElement()) continue;
String tag= evt.asStartElement().getName().getLocalPart();
String content= reader.getElementText().trim();
if(tag.equals("LastName"))
{
lastName= content;
}
else if(tag.equals("FirstName") || tag.equals("ForeName"))
{
firstName= content;
}
else if(tag.equals("Initials"))
{
initials= content;
}
else if(tag.equals("MiddleName"))
{
middle= content;
}
else if(tag.equals("CollectiveName"))
{
return null;
}
else if(tag.equals("Suffix"))
{
suffix= content;
}
For each author in an Article, a Relationship is created:
author.createRelationshipTo(article, PubmedRelations.IS_AUTHOR_OF);
Now we can find the Path between Jan Aerts and Egon Willighagen (ok, those author could be some homonyms, we need a unique identifier for the Authors ! ).
txn=this.graphDB.beginTx();
Node startAuthor= findByName.getSingleNode(PROPERTY_NAME, "J Aerts");
Node endAuthor= findByName.getSingleNode(PROPERTY_NAME, "EL Willighagen");
SingleSourceShortestPat<Integer> pathBFS;
pathBFS = new SingleSourceShortestPathBFS(
startAuthor,
Direction.BOTH,
PubmedRelations.IS_AUTHOR_OF
);

for(Node n: pathBFS.getPathAsNodes(endAuthor))
{
echo(n);
}

txn.finish();

Result:

  1. J Aerts
  2. "A physical map of the chicken genome."
  3. RK Wilson
  4. "Initial sequencing and comparative analysis of the mouse genome"
  5. RB Weiss
  6. "Independent evolution of bitter-taste sensitivity in humans and chimpanzees"
  7. MT Howard
  8. "The Blue Obelisk-interoperability in chemical informatics"
  9. EL Willighagen

Source code


import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.neo4j.graphalgo.shortestpath.SingleSourceShortestPath;
import org.neo4j.graphalgo.shortestpath.SingleSourceShortestPathBFS;
import org.neo4j.graphdb.Direction;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.index.IndexService;
import org.neo4j.index.lucene.LuceneIndexService;
import org.neo4j.kernel.EmbeddedGraphDatabase;

public class PubmedGraph
{
private GraphDatabaseService graphDB;
private IndexService findByPmid;
private IndexService findByName;

private static final String NODE_TYPE="node-type";
private static final String TYPE_ARTICLE="article";
private static final String TYPE_AUTHOR="author";
private static final String PROPERTY_PMID="pmid";
private static final String PROPERTY_NAME="name";
private static final String PROPERTY_TITLE="title";

private static enum PubmedRelations
implements RelationshipType
{
IS_AUTHOR_OF
}



private PubmedGraph()
{

}
private void open()
{
close();
graphDB=new EmbeddedGraphDatabase("/tmp/neo4j");
findByPmid= new LuceneIndexService( graphDB );
findByName= new LuceneIndexService( graphDB );
}
private void close()
{
if(findByPmid!=null)
{
findByPmid.shutdown();
findByPmid=null;
}
if(findByName!=null)
{
findByName.shutdown();
findByName=null;
}
if(graphDB!=null)
{
graphDB.shutdown();
}
graphDB=null;
}

private boolean read(InputStream in)
{
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
XMLEventReader reader=null;

Transaction txn=null;
try {
reader= xmlInputFactory.createXMLEventReader(in);
txn = graphDB.beginTx();

Node article=null;
while(reader.hasNext())
{
XMLEvent event = reader.nextEvent();
if(event.isStartElement())
{
StartElement e=event.asStartElement();
String name=e.getName().getLocalPart();
if(name.equals("PubmedArticle"))
{
article=null;
}
else if(name.equals("ArticleTitle") && article!=null && !article.hasProperty(PROPERTY_TITLE))
{
article.setProperty(PROPERTY_TITLE, reader.getElementText());
}
else if(name.equals("PMID") && article==null)
{
Integer pmid=new Integer(reader.getElementText());

article= findByPmid.getSingleNode(PROPERTY_PMID, pmid);


if(article==null)
{
article= this.graphDB.createNode();
article.setProperty(NODE_TYPE, TYPE_ARTICLE);
article.setProperty(PROPERTY_PMID, pmid);
findByPmid.index(article, PROPERTY_PMID, pmid);
}
}
else if(name.equals("Author") && article!=null)
{
Node author= author(reader);
if(author!=null)
{
author.createRelationshipTo(article, PubmedRelations.IS_AUTHOR_OF);
}
}
}
else if(event.isEndElement())
{
EndElement e=event.asEndElement();
String name=e.getName().getLocalPart();
if(name.equals("PubmedArticle"))
{
article=null;
}
}
}
txn.success();
return true;
}
catch (Exception e)
{
e.printStackTrace();
return false;
}
finally
{
if(txn!=null) txn.finish();
if(reader!=null) try {reader.close();}catch(Exception e2){}
}

}
private Node author(XMLEventReader reader) throws IOException,XMLStreamException
{
String lastName="";
String firstName="";
String initials="";
String middle="";
String suffix="";
XMLEvent evt;
while(!(evt=reader.nextEvent()).isEndDocument())
{
if(evt.isEndElement())
{
if(initials.isEmpty() && !firstName.isEmpty()) initials=""+firstName.charAt(0);
String s= initials+" "+middle+" "+lastName+" "+suffix;
String name= s.replaceAll("[ ]+", " ").trim();
if(name.isEmpty()) return null;
Node author=findByName.getSingleNode(PROPERTY_NAME, name);
if(author==null)
{
author= this.graphDB.createNode();
author.setProperty(NODE_TYPE, TYPE_AUTHOR);
author.setProperty(PROPERTY_NAME, name);
findByName.index(author, PROPERTY_NAME, name);
}
return author;
}
if(!evt.isStartElement()) continue;
String tag= evt.asStartElement().getName().getLocalPart();
String content= reader.getElementText().trim();
if(tag.equals("LastName"))
{
lastName= content;
}
else if(tag.equals("FirstName") || tag.equals("ForeName"))
{
firstName= content;
}
else if(tag.equals("Initials"))
{
initials= content;
}
else if(tag.equals("MiddleName"))
{
middle= content;
}
else if(tag.equals("CollectiveName"))
{
return null;
}
else if(tag.equals("Suffix"))
{
suffix= content;
}

else
{
//"###ignoring "+tag+"="+content);
}
}
throw new IOException("Cannot parse Author");
}

public void echo(Node n)
{
System.out.println("nodeId."+n.getId());
for(String p: n.getPropertyKeys())
{
System.out.println(" "+p+":\t"+n.getProperty(p));
}
System.out.println();
}

public void dump()
{
System.out.println("Dump:");
Transaction txn=null;
try {
txn=this.graphDB.beginTx();
for(Node n: this.graphDB.getAllNodes())
{
echo(n);
}
} finally
{
txn.finish();
}

}



private void search()
{
Transaction txn=null;
try {
txn=this.graphDB.beginTx();
Node startAuthor= findByName.getSingleNode(PROPERTY_NAME, "J Aerts");
if(startAuthor==null) { System.err.println("No found;");return;}
Node endAuthor= findByName.getSingleNode(PROPERTY_NAME, "EL Willighagen");

if(endAuthor==null) return;

System.err.println("Waking paths");
SingleSourceShortestPath<Integer> pathBFS;
pathBFS = new SingleSourceShortestPathBFS(
startAuthor,
Direction.BOTH,
PubmedRelations.IS_AUTHOR_OF
);


for(Node n: pathBFS.getPathAsNodes(endAuthor))
{
echo(n);
}

System.err.println("Done");


} finally
{
txn.finish();
}

}


public static void main(String[] args)
{

PubmedGraph app=new PubmedGraph();
try {
int optind=0;

app.open();


while(optind< args.length)
{
InputStream in= new FileInputStream(args[optind++]);
app.read(in);
in.close();
}


app.search();
} catch (Exception e) {
e.printStackTrace();
}
finally
{
if(app!=null) app.close();
}
}
}


That's it
Pierre

2 comments:

Peter Neubauer said...

Very cool!

Regarding relationship finding, Marko Rodriguez, Author of Gremlin, has a great Article on scholarly recommnedations in graphs

Cheers,

peter

Matte said...

Great post,

However I feel compelled to point out that your code instantiates two LuceneIndexService instances. Whereas it's OK, it isn't necessary to have more than one LuceneIndexService for any given GraphDatabaseService.

Best,
Mattias