14 December 2010

Looking for an expert ?

Yesterday, Andrew Su asked on Biostar: "Given a gene, what is the best automated method to identify the world experts? ".

Here is my solution:

  • First for a given gene name, we use NCBI-ESearch to find its Gene-Id in NCBI Gene
  • The Gene record is then downloaded as XML using NCBI-EFetch
  • XPATH is used to retrieve all the articles in pubmed about this gene and identified by the XML tags <PubMedId>
  • Each article is downloaded from pubmed. The element <Affiliation> is extracted from the record; sometimes this tag contains the main contact's email. The authors are also extracted and we count the number of times each author was found. I tried to solve the problem of ambiguity for the names of the authors by looking at the name, surname and initials. If an author's name was contained in the e-mail, the e-mail was assigned to that author.
  • At the end, all the authors are sorted by the number of times they were seen, and the most prolific author is printed out.


Source code

/**
* Author: Pierre Lindenbaum PhD
* WWW: http://plindenbaum.blogspot.com
* Motivation:
* Given a gene, identify the world experts
* http://biostar.stackexchange.com/questions/4296
*/
import java.net.URLEncoder;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
public class BioStar4296
{
private Logger LOG=Logger.getLogger(BioStar4296.class.getName());
private String organism="Homo Sapiens";
private DocumentBuilder docBuilder;
private XPath xpath;
private Collator collator;
static class Author
{
String suffix="";
String firstName="";
String lastName="";
String initials="";
Set<String> mails=new HashSet<String>();
Set<Integer> pmids=new TreeSet<Integer>();
int factor=1;
Set<String> affilitations=new HashSet<String>();
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result
+ ((firstName == null) ? 0 : firstName.hashCode());
result = prime * result
+ ((lastName == null) ? 0 : lastName.hashCode());
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
Author other = (Author) obj;
if (firstName == null) {
if (other.firstName != null)
return false;
} else if (!firstName.equals(other.firstName))
return false;
if (lastName == null) {
if (other.lastName != null)
return false;
} else if (!lastName.equals(other.lastName))
return false;
return true;
}
@Override
public String toString() {
return firstName+" "+lastName+" lab:"+this.affilitations+" mails:"+this.mails;
}
void write(XMLStreamWriter w)
throws Exception
{
w.writeStartElement("Person");
w.writeCharacters("\n");
w.writeStartElement("firstName");
w.writeCharacters(firstName);
w.writeEndElement();
w.writeCharacters("\n");
w.writeStartElement("lastName");
w.writeCharacters(lastName);
w.writeEndElement();
w.writeCharacters("\n");
for(Integer s:pmids)
{
w.writeStartElement("pmid");
w.writeCharacters(String.valueOf(s));
w.writeEndElement();
w.writeCharacters("\n");
}
for(String s:mails)
{
w.writeStartElement("mail");
w.writeCharacters(s);
w.writeEndElement();
w.writeCharacters("\n");
}
for(String s:affilitations)
{
w.writeStartElement("affilitation");
w.writeCharacters(s);
w.writeEndElement();
w.writeCharacters("\n");
}
w.writeEndElement();
w.writeCharacters("\n");
}
}
private BioStar4296() throws Exception
{
LOG.setLevel(Level.OFF);
DocumentBuilderFactory f=DocumentBuilderFactory.newInstance();
f.setNamespaceAware(false);
f.setCoalescing(true);
f.setIgnoringComments(true);
f.setIgnoringElementContentWhitespace(true);
f.setValidating(false);
this.docBuilder=f.newDocumentBuilder();
XPathFactory factory=XPathFactory.newInstance();
this.xpath=factory.newXPath();
this.collator= Collator.getInstance(Locale.FRENCH);
this.collator.setStrength(Collator.PRIMARY);
}
private int search(XMLStreamWriter w,String geneName)
throws Exception
{
w.writeCharacters("\n");
w.writeStartElement("gene");
w.writeAttribute("name", geneName);
String url= "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term="+
URLEncoder.encode(geneName+"[PREF] \""+this.organism+"\"[ORGN]", "UTF-8");
LOG.info(url);
Document dom=this.docBuilder.parse(url);
NodeList list=(NodeList)this.xpath.evaluate(
"/eSearchResult/IdList/Id",
dom,XPathConstants.NODESET);
if(list.getLength()==0)
{
w.writeComment("Cannot find any entry for "+geneName);
w.writeEndElement();
return -1;
}
else if(list.getLength()!=1)
{
w.writeComment("Ambigous name "+geneName);
w.writeEndElement();
return -1;
}
String geneId= list.item(0).getTextContent();
LOG.info("GeneId:"+geneId);
w.writeAttribute("geneId", geneId);
url="http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id="+
geneId+
"&rettype=text&retmode=xml";
LOG.info(url);
dom=this.docBuilder.parse(url);
list=(NodeList)this.xpath.evaluate(
"//PubMedId",
dom,XPathConstants.NODESET);
if(list.getLength()==0)
{
w.writeComment("No pubmed for "+geneName);
w.writeEndElement();
return -1;
}
List<Author> authors=new ArrayList<Author>();
Set<Integer> pmidSet=new TreeSet<Integer>();
for(int articleIdx=0;articleIdx< list.getLength();++articleIdx)
{
String pmid= list.item(articleIdx).getTextContent();
LOG.info("PMID:"+pmid);
pmidSet.add(Integer.parseInt(pmid));
}
w.writeAttribute("count-pmids",String.valueOf(pmidSet.size()));
w.writeCharacters("\n");
for(Integer pmid: pmidSet)
{
LOG.info("PMID:"+pmid);
url="http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id="+pmid+"&retmode=xml";
LOG.info("url:"+url);
dom=this.docBuilder.parse(url);
Node n=(Node)this.xpath.evaluate("//Affiliation", dom,XPathConstants.NODE);
if(n==null) continue;
String affiliation=n.getTextContent();
String adressFragments[]=affiliation.split("[ \t\\:\\<,\\>\\(\\)]");
LOG.info("affiliation:"+affiliation);
NodeList authorList=(NodeList)this.xpath.evaluate(
"//AuthorList/Author",
dom,XPathConstants.NODESET);
LOG.info("Authors:"+authorList.getLength());
if(authorList.getLength()==0) continue;
for(int j=0;j< authorList.getLength();++j)
{
boolean collective=false;
Author author=new Author();
for(Node c1=authorList.item(j).getFirstChild();c1!=null;c1=c1.getNextSibling())
{
if(c1.getNodeType()!=Node.ELEMENT_NODE) continue;
String tag=c1.getNodeName();
String content= c1.getTextContent();
if(tag.equals("LastName"))
{
author.lastName= content;
}
else if(tag.equals("FirstName") || tag.equals("ForeName"))
{
author.firstName= content;
}
else if(tag.equals("Initials"))
{
author.initials= content;
}
else if(tag.equals("CollectiveName"))
{
collective=true;
break;
}
else if(tag.equals("Suffix"))
{
author.suffix= content;
}
}
if(collective) continue;
LOG.info("Make New Author:"+author);
int k=0;
for(k=0;k< authors.size();++k)
{
Author p=authors.get(k);
if( !p.firstName.isEmpty() &&
this.collator.compare(p.firstName,author.firstName)==0 &&
this.collator.compare(p.lastName,author.lastName)==0)
{
LOG.info("Same: "+p+" "+author);
author=p;
break;
}
}
if(k==authors.size())
{
k=0;
for(k=0;k< authors.size();++k)
{
Author p=authors.get(k);
if(
(
(!author.initials.isEmpty() && p.firstName.toLowerCase().startsWith(author.initials.toLowerCase())) ||
(!p.initials.isEmpty() && author.firstName.toLowerCase().startsWith(p.initials) )||
this.collator.compare(p.initials,author.initials)==0 )&&
this.collator.compare(p.lastName,author.lastName)==0)
{
LOG.info("Same: "+p+" "+author);
if(p.firstName.length()< author.firstName.length())
{
p.firstName=author.firstName;
}
author=p;
break;
}
}
}
if(k==authors.size())
{
LOG.info("Adding: "+author);
authors.add(author);
}
author.factor*=j;
author.affilitations.add(affiliation);
author.pmids.add(pmid);
if(affiliation.indexOf('@')!=-1)
{
for(String mail: adressFragments)
{
mail.replaceAll("\\{\\}", "");
if(mail.endsWith(".")) mail= mail.substring(0,mail.length()-1);
int index=mail.indexOf('@');
if(index==-1) continue;
String mailPrefix=mail.substring(0,index).toLowerCase();
if(mailPrefix.contains(author.lastName.toLowerCase()) ||
collator.compare(mailPrefix, author.lastName)==0)
{
LOG.info("Adding: "+mail+" to "+author);
author.mails.add(mail.toLowerCase());
}
else if( author.firstName.length()>1 &&
(mailPrefix.contains( author.firstName.toLowerCase()) ||
collator.compare(mailPrefix, author.firstName)==0))
{
LOG.info("Adding: "+mail+" to "+author);
author.mails.add(mail.toLowerCase());
}
}
}
}
}
if(authors.isEmpty())
{
w.writeComment("No Author found");
w.writeEndElement();
return -1;
}
Collections.sort(authors,new Comparator<Author>()
{
@Override
public int compare(Author o1, Author o2)
{
int i= o2.pmids.size()-o1.pmids.size();
if(i!=0) return i;
i= o2.factor-o1.factor;//later is more interesting ? not sure...
return i;
}
});
authors.get(0).write(w);
w.writeEndElement();
return 0;
}
public static void main(String[] args)
{
try {
BioStar4296 app= new BioStar4296();
int optind=0;
while(optind<args.length)
{
if(args[optind].equals("-h"))
{
System.err.println("Pierre Lindenbaum");
System.err.println("Options:");
System.err.println(" -o <organism> ["+app.organism+"]");
System.err.println(" -v show logs");
return;
}
else if(args[optind].equals("-o"))
{
app.organism=args[++optind];
}
else if(args[optind].equals("-v"))
{
app.LOG.setLevel(Level.ALL);
}
else if(args[optind].equals("--"))
{
optind++;
break;
}
else if(args[optind].startsWith("-"))
{
System.err.println("Unnown option: "+args[optind]);
return;
}
else
{
break;
}
++optind;
}
if(optind==args.length)
{
System.err.println("Gene Name missing");
}
else
{
XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
w.writeStartDocument("UTF-8","1.0");
w.writeCharacters("\n");
w.writeStartElement("experts");
w.writeCharacters("\n");
while(optind < args.length)
{
app.search(w,args[optind]);
optind++;
w.writeCharacters("\n");
}
w.writeEndElement();
w.writeEndDocument();
w.flush();
}
} catch (Exception e)
{
e.printStackTrace();
}
}
}

Compilation

javac BioStar4296.java

Test

java BioStar4296 ZC3H7B eif4G1 PRNP

<?xml version="1.0" encoding="UTF-8"?>
<experts>
<gene name="ZC3H7B" geneId="23264" count-pmids="13">
<Person>
<firstName>Sumio</firstName>
<lastName>Sugano</lastName>
<pmid>8125298</pmid>
<pmid>9373149</pmid>
<pmid>14702039</pmid>
<affilitation>International and Interdisciplinary Studies, The University of Tokyo, Japan.</affilitation>
<affilitation>Institute of Medical Science, University of Tokyo, Japan.</affilitation>
<affilitation>Helix Research Institute, 1532-3 Yana, Kisarazu, Chiba 292-0812, Japan.</affilitation>
</Person>
</gene>
<gene name="eif4G1" geneId="1981" count-pmids="106">
<Person>
<firstName>Nahum</firstName>
<lastName>Sonenberg</lastName>
<pmid>7651417</pmid>
<pmid>7935836</pmid>
<pmid>8449919</pmid>
(...)
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montreal, H3G 1Y6, Quebec, Canada.</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montreal, Quebec, Canada.</affilitation>
<affilitation>Laboratories of Molecular Biophysics, The Rockefeller University, New York, New York 10021, USA.</affilitation>
(...)
</Person>
</gene>
<gene name="PRNP" geneId="5621" count-pmids="429">
<Person>
<firstName>John</firstName>
<lastName>Collinge</lastName>
<pmid>1352724</pmid>
<pmid>1677164</pmid>
<pmid>2159587</pmid>
<pmid>20583301</pmid>
(...)
<mail>j.collinge@ic.ac.uk</mail>
<affilitation>Krebs Institute for Biomolecular Research, Department of Molecular Biology and Biotechnology, University of Sheffield, Sheffield S10 2TN, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College School of Medicine at St. Mary's, London, United Kingdom. J.Collinge@ic.ac.uk</affilitation>
<affilitation>Division of Neuroscience (Neurophysiology), Medical School, University of Birmingham, Edgbaston, Birmingham, UK. sratte@pitt.edu</affilitation>
(...)
</Person>
</gene>
</experts>

about this result


  • ZC3H7B: the result is wrong. In Dr Sugano's work (3 articles), ZC3H7B was present among a large set of other genes used in his studies. The expert would be Dr D. Poncet, my former thesis advisor, but he 'only' wrote two articles about this protein.
  • Eif4G1: I know Dr Sonenberg is the expert. His email wasn't found.
  • PRNP Collinge seems to be the expert. Dr Collinge's e-mail was detected.


That's it,

Pierre

1 comment:

Matt Hodgkinson said...

Summary tools like PubReMiner, Anne O'Tate or Scopus can do this for those not inclined towards coding.