Looking for an expert ?
Yesterday, Andrew Su asked on Biostar: "Given a gene, what is the best automated method to identify the world experts? ".
Here is my solution:
- First for a given gene name, we use NCBI-ESearch to find its Gene-Id in NCBI Gene
- The Gene record is then downloaded as XML using NCBI-EFetch
- XPATH is used to retrieve all the articles in pubmed about this gene and identified by the XML tags <PubMedId>
- Each article is downloaded from PubMed. The element <Affiliation> is extracted from the record; sometimes this tag contains the main contact's email. The authors are also extracted and we count the number of times each author was found. I tried to solve the problem of ambiguity for the names of the authors by looking at the name, surname and initials. If an author's name was contained in an e-mail address, that address was assigned to him.
- At the end, all the authors are sorted by the number of times they were seen and the most prolific author is printed out.
Source code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Author: Pierre Lindenbaum PhD
 * WWW: http://plindenbaum.blogspot.com
 * Motivation:
 * Given a gene, identify the world experts
 * http://biostar.stackexchange.com/questions/4296
 */
import java.net.URLEncoder;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Command-line tool that, for each gene name given on the command line,
 * looks the gene up in NCBI Gene (ESearch), downloads the gene record
 * (EFetch), collects every {@code <PubMedId>} it references, downloads each
 * PubMed article and tallies the authors. The author seen in the largest
 * number of articles is written to standard output as XML.
 */
public class BioStar4296
{
    /**
     * Base URL shared by all E-utilities requests. NCBI documents the HTTPS
     * endpoint as the base URL; the original code used plain HTTP and two
     * different host names.
     */
    private static final String EUTILS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /**
     * Orders authors by decreasing number of articles, then by decreasing
     * {@code factor}. Written with explicit comparisons rather than
     * subtraction: {@code factor} is a running product that may overflow, and
     * subtracting two such values can wrap around and break the
     * {@link Comparator} contract.
     */
    static final Comparator<Author> MOST_PROLIFIC_FIRST = new Comparator<Author>()
    {
        @Override
        public int compare(Author o1, Author o2)
        {
            int n1 = o1.pmids.size();
            int n2 = o2.pmids.size();
            if (n1 != n2)
            {
                return n2 > n1 ? 1 : -1;
            }
            // later is more interesting ? not sure...
            if (o1.factor == o2.factor)
            {
                return 0;
            }
            return o2.factor > o1.factor ? 1 : -1;
        }
    };

    private Logger LOG = Logger.getLogger(BioStar4296.class.getName());
    /** Organism used to restrict the gene search; changed with option -o. */
    private String organism = "Homo Sapiens";
    private DocumentBuilder docBuilder;
    private XPath xpath;
    /** Primary-strength collator used to compare author names. */
    private Collator collator;

    /** One author, accumulated over all the articles in which it was seen. */
    static class Author
    {
        String suffix = "";
        String firstName = "";
        String lastName = "";
        String initials = "";
        Set<String> mails = new HashSet<String>();
        Set<Integer> pmids = new TreeSet<Integer>();
        /** Running product of the author's positions in the author lists. */
        int factor = 1;
        Set<String> affilitations = new HashSet<String>();

        @Override
        public int hashCode()
        {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((firstName == null) ? 0 : firstName.hashCode());
            result = prime * result + ((lastName == null) ? 0 : lastName.hashCode());
            return result;
        }

        /** Two authors are equal when first and last names are equal. */
        @Override
        public boolean equals(Object obj)
        {
            if (this == obj)
            {
                return true;
            }
            if (obj == null || getClass() != obj.getClass())
            {
                return false;
            }
            Author other = (Author) obj;
            if (firstName == null ? other.firstName != null : !firstName.equals(other.firstName))
            {
                return false;
            }
            if (lastName == null ? other.lastName != null : !lastName.equals(other.lastName))
            {
                return false;
            }
            return true;
        }

        @Override
        public String toString()
        {
            return firstName + " " + lastName + " lab:" + this.affilitations + " mails:" + this.mails;
        }

        /**
         * Writes this author as a {@code <Person>} element: names first, then
         * one element per pmid, mail and affiliation.
         *
         * @param w the stream writer receiving the XML
         * @throws Exception if the writer fails
         */
        void write(XMLStreamWriter w)
            throws Exception
        {
            w.writeStartElement("Person");
            w.writeCharacters("\n");
            writeTextElement(w, "firstName", firstName);
            writeTextElement(w, "lastName", lastName);
            for (Integer s : pmids)
            {
                writeTextElement(w, "pmid", String.valueOf(s));
            }
            for (String s : mails)
            {
                writeTextElement(w, "mail", s);
            }
            for (String s : affilitations)
            {
                // element name kept as-is: it is part of the emitted format
                writeTextElement(w, "affilitation", s);
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }

        /** Writes {@code <tag>text</tag>} followed by a newline. */
        private static void writeTextElement(XMLStreamWriter w, String tag, String text)
            throws Exception
        {
            w.writeStartElement(tag);
            w.writeCharacters(text);
            w.writeEndElement();
            w.writeCharacters("\n");
        }
    }

    /**
     * Builds the XML parser, the XPath engine and the collator. Logging is
     * switched off until option -v turns it on.
     *
     * @throws Exception if the XML parser cannot be created
     */
    private BioStar4296() throws Exception
    {
        LOG.setLevel(Level.OFF);
        DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
        f.setNamespaceAware(false);
        f.setCoalescing(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        f.setValidating(false);
        this.docBuilder = f.newDocumentBuilder();
        XPathFactory factory = XPathFactory.newInstance();
        this.xpath = factory.newXPath();
        this.collator = Collator.getInstance(Locale.FRENCH);
        this.collator.setStrength(Collator.PRIMARY);
    }

    /** Lower-cases independently of the JVM's default locale. */
    private static String lower(String s)
    {
        return s.toLowerCase(Locale.ROOT);
    }

    /**
     * Normalizes one whitespace/punctuation-delimited fragment of an
     * affiliation before it is tested for an e-mail address: curly braces are
     * removed and one trailing dot is dropped.
     *
     * The original code called {@code replaceAll} without using its result,
     * so the braces were never removed.
     *
     * @param token a fragment of the affiliation string
     * @return the cleaned fragment
     */
    static String cleanMailToken(String token)
    {
        String mail = token.replaceAll("[{}]", "");
        if (mail.endsWith("."))
        {
            mail = mail.substring(0, mail.length() - 1);
        }
        return mail;
    }

    /**
     * Looks for an already registered author that designates the same person
     * as {@code candidate}.
     *
     * Pass 1 requires the same (non-empty) first name and the same last name.
     * Pass 2 requires the same last name and compatible initials: the
     * candidate's initials prefix the known first name, the known initials
     * prefix the candidate's first name, or both have the same initials. When
     * pass 2 matches, the longer first name is kept on the registered author.
     *
     * @param authors   authors registered so far
     * @param candidate author just read from an article
     * @param collator  collator used to compare names
     * @return the matching registered author, or {@code null} if none matches
     */
    static Author findKnownAuthor(List<Author> authors, Author candidate, Collator collator)
    {
        for (Author p : authors)
        {
            if (!p.firstName.isEmpty()
                && collator.compare(p.firstName, candidate.firstName) == 0
                && collator.compare(p.lastName, candidate.lastName) == 0)
            {
                return p;
            }
        }

        String candidateInitials = lower(candidate.initials);
        String candidateFirstName = lower(candidate.firstName);
        for (Author p : authors)
        {
            if (collator.compare(p.lastName, candidate.lastName) != 0)
            {
                continue;
            }
            // both sides are lower-cased; the original compared a lower-cased
            // first name with raw (upper-case) initials and never matched
            boolean compatible =
                (!candidate.initials.isEmpty() && lower(p.firstName).startsWith(candidateInitials))
                || (!p.initials.isEmpty() && candidateFirstName.startsWith(lower(p.initials)))
                || collator.compare(p.initials, candidate.initials) == 0;
            if (compatible)
            {
                if (p.firstName.length() < candidate.firstName.length())
                {
                    p.firstName = candidate.firstName;
                }
                return p;
            }
        }
        return null;
    }

    /**
     * Reads one {@code <Author>} element of a PubMed record.
     *
     * @param authorNode the {@code <Author>} element
     * @return the author, or {@code null} when the element holds a
     *         {@code <CollectiveName>} (a group, not a person)
     */
    private static Author parseAuthor(Node authorNode)
    {
        Author author = new Author();
        for (Node child = authorNode.getFirstChild(); child != null; child = child.getNextSibling())
        {
            if (child.getNodeType() != Node.ELEMENT_NODE)
            {
                continue;
            }
            String tag = child.getNodeName();
            String content = child.getTextContent();
            if (tag.equals("LastName"))
            {
                author.lastName = content;
            }
            else if (tag.equals("FirstName") || tag.equals("ForeName"))
            {
                author.firstName = content;
            }
            else if (tag.equals("Initials"))
            {
                author.initials = content;
            }
            else if (tag.equals("CollectiveName"))
            {
                return null;
            }
            else if (tag.equals("Suffix"))
            {
                author.suffix = content;
            }
        }
        return author;
    }

    /**
     * Attaches to {@code author} every e-mail address of the affiliation
     * whose local part contains (or collates equal to) the author's last
     * name, or first name when it is longer than one character.
     *
     * @param author    the author to enrich
     * @param fragments the affiliation split on blanks and punctuation
     */
    private void collectMails(Author author, String[] fragments)
    {
        String lastName = lower(author.lastName);
        String firstName = lower(author.firstName);
        for (String fragment : fragments)
        {
            String mail = cleanMailToken(fragment);
            int at = mail.indexOf('@');
            if (at == -1)
            {
                continue;
            }
            String mailPrefix = lower(mail.substring(0, at));
            // an empty last name is contained in every string: skip it so it
            // does not claim every address of the affiliation
            boolean byLastName = !lastName.isEmpty()
                && (mailPrefix.contains(lastName)
                    || collator.compare(mailPrefix, author.lastName) == 0);
            boolean byFirstName = author.firstName.length() > 1
                && (mailPrefix.contains(firstName)
                    || collator.compare(mailPrefix, author.firstName) == 0);
            if (byLastName || byFirstName)
            {
                LOG.info("Adding: " + mail + " to " + author);
                author.mails.add(lower(mail));
            }
        }
    }

    /** Writes {@code message} as an XML comment, closes the gene element and returns -1. */
    private static int fail(XMLStreamWriter w, String message)
        throws Exception
    {
        w.writeComment(message);
        w.writeEndElement();
        return -1;
    }

    /**
     * Searches the most prolific author for one gene and writes the result
     * as a {@code <gene>} element.
     *
     * @param w        the stream writer receiving the XML
     * @param geneName the gene name to look up
     * @return 0 when an author was written, -1 when the gene was not found,
     *         was ambiguous, had no PubMed reference or no author
     * @throws Exception on any network, parsing or writing failure
     */
    private int search(XMLStreamWriter w, String geneName)
        throws Exception
    {
        w.writeCharacters("\n");
        w.writeStartElement("gene");
        w.writeAttribute("name", geneName);

        // 1. gene name -> gene id
        String url = EUTILS + "esearch.fcgi?db=gene&term="
            + URLEncoder.encode(geneName + "[PREF] \"" + this.organism + "\"[ORGN]", "UTF-8");
        LOG.info(url);
        Document dom = this.docBuilder.parse(url);
        NodeList list = (NodeList) this.xpath.evaluate(
            "/eSearchResult/IdList/Id", dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return fail(w, "Cannot find any entry for " + geneName);
        }
        if (list.getLength() != 1)
        {
            return fail(w, "Ambigous name " + geneName);
        }
        String geneId = list.item(0).getTextContent();
        LOG.info("GeneId:" + geneId);
        w.writeAttribute("geneId", geneId);

        // 2. gene record -> PubMed ids (a sorted set removes duplicates)
        url = EUTILS + "efetch.fcgi?db=gene&id=" + geneId + "&rettype=text&retmode=xml";
        LOG.info(url);
        dom = this.docBuilder.parse(url);
        list = (NodeList) this.xpath.evaluate("//PubMedId", dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return fail(w, "No pubmed for " + geneName);
        }
        Set<Integer> pmidSet = new TreeSet<Integer>();
        for (int articleIdx = 0; articleIdx < list.getLength(); ++articleIdx)
        {
            String pmid = list.item(articleIdx).getTextContent().trim();
            LOG.info("PMID:" + pmid);
            pmidSet.add(Integer.parseInt(pmid));
        }
        w.writeAttribute("count-pmids", String.valueOf(pmidSet.size()));
        w.writeCharacters("\n");

        // 3. each article -> authors
        List<Author> authors = new ArrayList<Author>();
        for (Integer pmid : pmidSet)
        {
            LOG.info("PMID:" + pmid);
            url = EUTILS + "efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml";
            LOG.info("url:" + url);
            dom = this.docBuilder.parse(url);
            Node n = (Node) this.xpath.evaluate("//Affiliation", dom, XPathConstants.NODE);
            if (n == null)
            {
                // articles without any affiliation are not counted at all
                continue;
            }
            String affiliation = n.getTextContent();
            String adressFragments[] = affiliation.split("[ \t\\:\\<,\\>\\(\\)]");
            LOG.info("affiliation:" + affiliation);
            NodeList authorList = (NodeList) this.xpath.evaluate(
                "//AuthorList/Author", dom, XPathConstants.NODESET);
            LOG.info("Authors:" + authorList.getLength());

            for (int j = 0; j < authorList.getLength(); ++j)
            {
                Author author = parseAuthor(authorList.item(j));
                if (author == null)
                {
                    continue;
                }
                LOG.info("Make New Author:" + author);
                Author known = findKnownAuthor(authors, author, this.collator);
                if (known != null)
                {
                    LOG.info("Same: " + known + " " + author);
                    author = known;
                }
                else
                {
                    LOG.info("Adding: " + author);
                    authors.add(author);
                }
                // NOTE(review): j is 0 for a first author, which zeroes the
                // factor for good; kept as in the original heuristic
                author.factor *= j;
                author.affilitations.add(affiliation);
                author.pmids.add(pmid);
                if (affiliation.indexOf('@') != -1)
                {
                    collectMails(author, adressFragments);
                }
            }
        }
        if (authors.isEmpty())
        {
            return fail(w, "No Author found");
        }

        // 4. report the author with the most articles
        Collections.sort(authors, MOST_PROLIFIC_FIRST);
        authors.get(0).write(w);
        w.writeEndElement();
        return 0;
    }

    /**
     * Entry point. Options: -h (help), -o organism, -v (show logs), -- (end
     * of options). Remaining arguments are gene names; one {@code <gene>}
     * element is written to standard output for each of them.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args)
    {
        try
        {
            BioStar4296 app = new BioStar4296();
            int optind = 0;
            while (optind < args.length)
            {
                if (args[optind].equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum");
                    System.err.println("Options:");
                    System.err.println(" -o <organism> [" + app.organism + "]");
                    System.err.println(" -v show logs");
                    return;
                }
                else if (args[optind].equals("-o"))
                {
                    // -o given as the last argument used to raise
                    // ArrayIndexOutOfBoundsException
                    if (optind + 1 >= args.length)
                    {
                        System.err.println("Option -o requires an organism");
                        return;
                    }
                    app.organism = args[++optind];
                }
                else if (args[optind].equals("-v"))
                {
                    app.LOG.setLevel(Level.ALL);
                }
                else if (args[optind].equals("--"))
                {
                    optind++;
                    break;
                }
                else if (args[optind].startsWith("-"))
                {
                    System.err.println("Unnown option: " + args[optind]);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }
            if (optind == args.length)
            {
                System.err.println("Gene Name missing");
            }
            else
            {
                XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
                XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8");
                w.writeStartDocument("UTF-8", "1.0");
                w.writeCharacters("\n");
                w.writeStartElement("experts");
                w.writeCharacters("\n");
                while (optind < args.length)
                {
                    app.search(w, args[optind]);
                    optind++;
                    w.writeCharacters("\n");
                }
                w.writeEndElement();
                w.writeEndDocument();
                w.flush();
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}
Compilation
javac BioStar4296.java
Test
java BioStar4296 ZC3H7B eif4G1 PRNP
<?xml version="1.0" encoding="UTF-8"?>
<experts>
<gene name="ZC3H7B" geneId="23264" count-pmids="13">
<Person>
<firstName>Sumio</firstName>
<lastName>Sugano</lastName>
<pmid>8125298</pmid>
<pmid>9373149</pmid>
<pmid>14702039</pmid>
<affilitation>International and Interdisciplinary Studies, The University of Tokyo, Japan.</affilitation>
<affilitation>Institute of Medical Science, University of Tokyo, Japan.</affilitation>
<affilitation>Helix Research Institute, 1532-3 Yana, Kisarazu, Chiba 292-0812, Japan.</affilitation>
</Person>
</gene>
<gene name="eif4G1" geneId="1981" count-pmids="106">
<Person>
<firstName>Nahum</firstName>
<lastName>Sonenberg</lastName>
<pmid>7651417</pmid>
<pmid>7935836</pmid>
<pmid>8449919</pmid>
(...)
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montreal, H3G 1Y6, Quebec, Canada.</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montreal, Quebec, Canada.</affilitation>
<affilitation>Laboratories of Molecular Biophysics, The Rockefeller University, New York, New York 10021, USA.</affilitation>
(...)
</Person>
</gene>
<gene name="PRNP" geneId="5621" count-pmids="429">
<Person>
<firstName>John</firstName>
<lastName>Collinge</lastName>
<pmid>1352724</pmid>
<pmid>1677164</pmid>
<pmid>2159587</pmid>
<pmid>20583301</pmid>
(...)
<mail>j.collinge@ic.ac.uk</mail>
<affilitation>Krebs Institute for Biomolecular Research, Department of Molecular Biology and Biotechnology, University of Sheffield, Sheffield S10 2TN, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College School of Medicine at St. Mary's, London, United Kingdom. J.Collinge@ic.ac.uk</affilitation>
<affilitation>Division of Neuroscience (Neurophysiology), Medical School, University of Birmingham, Edgbaston, Birmingham, UK. sratte@pitt.edu</affilitation>
(...)
</Person>
</gene>
</experts>
<?xml version="1.0" encoding="UTF-8"?>
<experts>
<gene name="ZC3H7B" geneId="23264" count-pmids="13">
<Person>
<firstName>Sumio</firstName>
<lastName>Sugano</lastName>
<pmid>8125298</pmid>
<pmid>9373149</pmid>
<pmid>14702039</pmid>
<affilitation>International and Interdisciplinary Studies, The University of Tokyo, Japan.</affilitation>
<affilitation>Institute of Medical Science, University of Tokyo, Japan.</affilitation>
<affilitation>Helix Research Institute, 1532-3 Yana, Kisarazu, Chiba 292-0812, Japan.</affilitation>
</Person>
</gene>
<gene name="eif4G1" geneId="1981" count-pmids="106">
<Person>
<firstName>Nahum</firstName>
<lastName>Sonenberg</lastName>
<pmid>7651417</pmid>
<pmid>7935836</pmid>
<pmid>8449919</pmid>
(...)
<affilitation>Department of Biochemistry and McGill Cancer Center, McGill University, Montreal, H3G 1Y6, Quebec, Canada.</affilitation>
<affilitation>Department of Biochemistry, McGill University, Montreal, Quebec, Canada.</affilitation>
<affilitation>Laboratories of Molecular Biophysics, The Rockefeller University, New York, New York 10021, USA.</affilitation>
(...)
</Person>
</gene>
<gene name="PRNP" geneId="5621" count-pmids="429">
<Person>
<firstName>John</firstName>
<lastName>Collinge</lastName>
<pmid>1352724</pmid>
<pmid>1677164</pmid>
<pmid>2159587</pmid>
<pmid>20583301</pmid>
(...)
<mail>j.collinge@ic.ac.uk</mail>
<affilitation>Krebs Institute for Biomolecular Research, Department of Molecular Biology and Biotechnology, University of Sheffield, Sheffield S10 2TN, UK.</affilitation>
<affilitation>MRC Prion Unit and Department of Neurogenetics, Imperial College School of Medicine at St. Mary's, London, United Kingdom. J.Collinge@ic.ac.uk</affilitation>
<affilitation>Division of Neuroscience (Neurophysiology), Medical School, University of Birmingham, Edgbaston, Birmingham, UK. sratte@pitt.edu</affilitation>
(...)
</Person>
</gene>
</experts>
about this result
- ZC3H7B: the result is wrong. In Dr Sugano's articles (3 articles), ZC3H7B was present among a large set of other genes used in his studies. The expert would be Dr D. Poncet, my former thesis advisor, but he 'only' wrote two articles about this protein.
- Eif4G1: I know Dr Sonenberg is the expert. His email wasn't found.
- PRNP Collinge seems to be the expert. Dr Collinge's e-mail was detected.
That's it,
Pierre
1 comment:
Summary tools like PubReMiner, Anne O'Tate or Scopus can do this for those not inclined towards coding.
Post a Comment