17 February 2010

The path from EgonWillighagen to Jandot : Neo4j , a graph API for java: my notebook.

neo4j "Neo4j is a graph database. It is an embedded, disk-based, fully transactional Java persistence engine that stores data structured in graphs rather than in tables. A graph (mathematical lingo for a network) is a flexible data structure that allows a more agile and rapid style of development.".
In the current post, I'll use the neo4j API to load a set of pubmed entries and find the shortest path between two authors.
First, a few constants are defined:
private static final String NODE_TYPE="node-type";
private static final String TYPE_ARTICLE="article";
private static final String TYPE_AUTHOR="author";
private static final String PROPERTY_PMID="pmid";
private static final String PROPERTY_NAME="name";
private static final String PROPERTY_TITLE="title";
We also need a Neo4J embedded graph as well as two indexes that will be used to quickly find if an article or an author have already been inserted in the graph:
//this graph will be stored in /tmp/neo4j
private GraphDatabaseService graphDB =new EmbeddedGraphDatabase("/tmp/neo4j");
private IndexService findByPmid= new LuceneIndexService( graphDB );
private IndexService findByName= new LuceneIndexService( graphDB );
when the program exists, all those objects have to be closed:
private void close()
{
if(findByPmid!=null)
{
findByPmid.shutdown();
findByPmid=null;
}
if(findByName!=null)
{
findByName.shutdown();
findByName=null;
}
if(graphDB!=null)
{
graphDB.shutdown();
}
graphDB=null;
}
A Stax parser is used to read the XML pubmed entries. For this test, I've loaded a bunch of articles from 'Nature' and from the Biogang. Each time a new pubmed identifier (PMID) is found, the Node is searched in the findByPmid index, else a new node is created:

Transaction txn=graphDB.beginTx();
(...)
if(name.equals("PMID") && article==null)
{
Integer pmid=new Integer(reader.getElementText());
article= findByPmid.getSingleNode(PROPERTY_PMID, pmid);

//article was not already in the graph
if(article==null)
{
article= this.graphDB.createNode();
article.setProperty(NODE_TYPE, TYPE_ARTICLE);
article.setProperty(PROPERTY_PMID, pmid);
findByPmid.index(article, PROPERTY_PMID, pmid);
}
}
(..)
txn.success();
Likewise, each time a new pubmed author (PMID) is found, the Node is searched in the findByName index, else a new node is created.
while(!(evt=reader.nextEvent()).isEndDocument())
{
if(evt.isEndElement())
{
if(initials.isEmpty() && !firstName.isEmpty()) initials=""+firstName.charAt(0);
String s= initials+" "+middle+" "+lastName+" "+suffix;
String name= s.replaceAll("[ ]+", " ").trim();
if(name.isEmpty()) return null;
Node author=findByName.getSingleNode(PROPERTY_NAME, name);
if(author==null)
{
author= this.graphDB.createNode();
author.setProperty(NODE_TYPE, TYPE_AUTHOR);
author.setProperty(PROPERTY_NAME, name);
findByName.index(author, PROPERTY_NAME, name);
}
return author;
}
if(!evt.isStartElement()) continue;
String tag= evt.asStartElement().getName().getLocalPart();
String content= reader.getElementText().trim();
if(tag.equals("LastName"))
{
lastName= content;
}
else if(tag.equals("FirstName") || tag.equals("ForeName"))
{
firstName= content;
}
else if(tag.equals("Initials"))
{
initials= content;
}
else if(tag.equals("MiddleName"))
{
middle= content;
}
else if(tag.equals("CollectiveName"))
{
return null;
}
else if(tag.equals("Suffix"))
{
suffix= content;
}
For each author in an Article, a Relationship is created:
author.createRelationshipTo(article, PubmedRelations.IS_AUTHOR_OF);
Now we can find the Path between Jan Aerts and Egon Willighagen (ok, those author could be some homonyms, we need a unique identifier for the Authors ! ).
txn=this.graphDB.beginTx();
Node startAuthor= findByName.getSingleNode(PROPERTY_NAME, "J Aerts");
Node endAuthor= findByName.getSingleNode(PROPERTY_NAME, "EL Willighagen");
SingleSourceShortestPat<Integer> pathBFS;
pathBFS = new SingleSourceShortestPathBFS(
startAuthor,
Direction.BOTH,
PubmedRelations.IS_AUTHOR_OF
);

for(Node n: pathBFS.getPathAsNodes(endAuthor))
{
echo(n);
}

txn.finish();

Result:

  1. J Aerts
  2. "A physical map of the chicken genome."
  3. RK Wilson
  4. "Initial sequencing and comparative analysis of the mouse genome"
  5. RB Weiss
  6. "Independent evolution of bitter-taste sensitivity in humans and chimpanzees"
  7. MT Howard
  8. "The Blue Obelisk-interoperability in chemical informatics"
  9. EL Willighagen

Source code


import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.List;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.neo4j.graphalgo.shortestpath.SingleSourceShortestPath;
import org.neo4j.graphalgo.shortestpath.SingleSourceShortestPathBFS;
import org.neo4j.graphdb.Direction;
import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.Transaction;
import org.neo4j.index.IndexService;
import org.neo4j.index.lucene.LuceneIndexService;
import org.neo4j.kernel.EmbeddedGraphDatabase;

public class PubmedGraph
{
private GraphDatabaseService graphDB;
private IndexService findByPmid;
private IndexService findByName;

private static final String NODE_TYPE="node-type";
private static final String TYPE_ARTICLE="article";
private static final String TYPE_AUTHOR="author";
private static final String PROPERTY_PMID="pmid";
private static final String PROPERTY_NAME="name";
private static final String PROPERTY_TITLE="title";

private static enum PubmedRelations
implements RelationshipType
{
IS_AUTHOR_OF
}



private PubmedGraph()
{

}
private void open()
{
close();
graphDB=new EmbeddedGraphDatabase("/tmp/neo4j");
findByPmid= new LuceneIndexService( graphDB );
findByName= new LuceneIndexService( graphDB );
}
private void close()
{
if(findByPmid!=null)
{
findByPmid.shutdown();
findByPmid=null;
}
if(findByName!=null)
{
findByName.shutdown();
findByName=null;
}
if(graphDB!=null)
{
graphDB.shutdown();
}
graphDB=null;
}

private boolean read(InputStream in)
{
XMLInputFactory xmlInputFactory = XMLInputFactory.newInstance();
xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.FALSE);
xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
XMLEventReader reader=null;

Transaction txn=null;
try {
reader= xmlInputFactory.createXMLEventReader(in);
txn = graphDB.beginTx();

Node article=null;
while(reader.hasNext())
{
XMLEvent event = reader.nextEvent();
if(event.isStartElement())
{
StartElement e=event.asStartElement();
String name=e.getName().getLocalPart();
if(name.equals("PubmedArticle"))
{
article=null;
}
else if(name.equals("ArticleTitle") && article!=null && !article.hasProperty(PROPERTY_TITLE))
{
article.setProperty(PROPERTY_TITLE, reader.getElementText());
}
else if(name.equals("PMID") && article==null)
{
Integer pmid=new Integer(reader.getElementText());

article= findByPmid.getSingleNode(PROPERTY_PMID, pmid);


if(article==null)
{
article= this.graphDB.createNode();
article.setProperty(NODE_TYPE, TYPE_ARTICLE);
article.setProperty(PROPERTY_PMID, pmid);
findByPmid.index(article, PROPERTY_PMID, pmid);
}
}
else if(name.equals("Author") && article!=null)
{
Node author= author(reader);
if(author!=null)
{
author.createRelationshipTo(article, PubmedRelations.IS_AUTHOR_OF);
}
}
}
else if(event.isEndElement())
{
EndElement e=event.asEndElement();
String name=e.getName().getLocalPart();
if(name.equals("PubmedArticle"))
{
article=null;
}
}
}
txn.success();
return true;
}
catch (Exception e)
{
e.printStackTrace();
return false;
}
finally
{
if(txn!=null) txn.finish();
if(reader!=null) try {reader.close();}catch(Exception e2){}
}

}
private Node author(XMLEventReader reader) throws IOException,XMLStreamException
{
String lastName="";
String firstName="";
String initials="";
String middle="";
String suffix="";
XMLEvent evt;
while(!(evt=reader.nextEvent()).isEndDocument())
{
if(evt.isEndElement())
{
if(initials.isEmpty() && !firstName.isEmpty()) initials=""+firstName.charAt(0);
String s= initials+" "+middle+" "+lastName+" "+suffix;
String name= s.replaceAll("[ ]+", " ").trim();
if(name.isEmpty()) return null;
Node author=findByName.getSingleNode(PROPERTY_NAME, name);
if(author==null)
{
author= this.graphDB.createNode();
author.setProperty(NODE_TYPE, TYPE_AUTHOR);
author.setProperty(PROPERTY_NAME, name);
findByName.index(author, PROPERTY_NAME, name);
}
return author;
}
if(!evt.isStartElement()) continue;
String tag= evt.asStartElement().getName().getLocalPart();
String content= reader.getElementText().trim();
if(tag.equals("LastName"))
{
lastName= content;
}
else if(tag.equals("FirstName") || tag.equals("ForeName"))
{
firstName= content;
}
else if(tag.equals("Initials"))
{
initials= content;
}
else if(tag.equals("MiddleName"))
{
middle= content;
}
else if(tag.equals("CollectiveName"))
{
return null;
}
else if(tag.equals("Suffix"))
{
suffix= content;
}

else
{
//"###ignoring "+tag+"="+content);
}
}
throw new IOException("Cannot parse Author");
}

public void echo(Node n)
{
System.out.println("nodeId."+n.getId());
for(String p: n.getPropertyKeys())
{
System.out.println(" "+p+":\t"+n.getProperty(p));
}
System.out.println();
}

public void dump()
{
System.out.println("Dump:");
Transaction txn=null;
try {
txn=this.graphDB.beginTx();
for(Node n: this.graphDB.getAllNodes())
{
echo(n);
}
} finally
{
txn.finish();
}

}



private void search()
{
Transaction txn=null;
try {
txn=this.graphDB.beginTx();
Node startAuthor= findByName.getSingleNode(PROPERTY_NAME, "J Aerts");
if(startAuthor==null) { System.err.println("No found;");return;}
Node endAuthor= findByName.getSingleNode(PROPERTY_NAME, "EL Willighagen");

if(endAuthor==null) return;

System.err.println("Waking paths");
SingleSourceShortestPath<Integer> pathBFS;
pathBFS = new SingleSourceShortestPathBFS(
startAuthor,
Direction.BOTH,
PubmedRelations.IS_AUTHOR_OF
);


for(Node n: pathBFS.getPathAsNodes(endAuthor))
{
echo(n);
}

System.err.println("Done");


} finally
{
txn.finish();
}

}


public static void main(String[] args)
{

PubmedGraph app=new PubmedGraph();
try {
int optind=0;

app.open();


while(optind< args.length)
{
InputStream in= new FileInputStream(args[optind++]);
app.read(in);
in.close();
}


app.search();
} catch (Exception e) {
e.printStackTrace();
}
finally
{
if(app!=null) app.close();
}
}
}


That's it
Pierre

2 comments:

  1. Very cool!

    Regarding relationship finding, Marko Rodriguez, Author of Gremlin, has a great Article on scholarly recommnedations in graphs

    Cheers,

    peter

    ReplyDelete
  2. Great post,

    However I feel compelled to point out that your code instantiates two LuceneIndexService instances. Whereas it's OK, it isn't necessary to have more than one LuceneIndexService for any given GraphDatabaseService.

    Best,
    Mattias

    ReplyDelete