
/**
 * Author: Pierre Lindenbaum PhD
 * WWW: http://plindenbaum.blogspot.com
 * Motivation:
 * Given a gene, identify the world experts
 * http://biostar.stackexchange.com/questions/4296
 */

import java.net.URLEncoder;
import java.text.Collator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;


/**
 * Command-line tool that, for every gene symbol given as argument, queries the
 * NCBI E-utilities and writes (as XML on standard output) the author who is
 * attached to the largest number of PubMed articles referenced by that gene
 * record.
 *
 * <p>Usage: {@code java BioStar4296 [-o organism] [-v] gene1 [gene2 ...]}
 */
public class BioStar4296
{
    /**
     * Common prefix of every E-utilities request. HTTPS is used because NCBI
     * requires it for E-utilities, and {@code HttpURLConnection} does not
     * follow a redirect from {@code http} to {@code https}.
     */
    private static final String EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** Separators used to cut an affiliation line into tokens when looking for e-mail addresses. */
    private static final Pattern ADDRESS_SEPARATORS = Pattern.compile("[ \t\\:\\<,\\>\\(\\)]");

    /** Curly braces, removed from a token before it is examined as an e-mail address. */
    private static final Pattern CURLY_BRACES = Pattern.compile("[\\{\\}]");

    /** Diagnostic logger; silenced by the constructor and enabled by the {@code -v} option. */
    private static final Logger LOG = Logger.getLogger(BioStar4296.class.getName());

    /**
     * Orders authors so that the one with the most articles comes first; ties
     * are broken by the larger {@link Author#factor}. Comparison is done
     * without subtraction so that it cannot overflow.
     */
    private static final Comparator<Author> BEST_EXPERT_FIRST = new Comparator<Author>()
    {
        @Override
        public int compare(Author o1, Author o2)
        {
            int byArticles = Integer.valueOf(o2.pmids.size()).compareTo(o1.pmids.size());
            if (byArticles != 0)
            {
                return byArticles;
            }
            // later is more interesting ? not sure...
            return Integer.valueOf(o2.factor).compareTo(o1.factor);
        }
    };

    /** Organism used to restrict the gene search (option {@code -o}). */
    private String organism = "Homo Sapiens";
    /** Parser used to load every E-utilities XML answer. */
    private final DocumentBuilder docBuilder;
    /** XPath engine used to pick nodes out of the parsed answers. */
    private final XPath xpath;
    /** Primary-strength collator used to compare names and initials. */
    private final Collator collator;

    /**
     * One person found in the author lists of the inspected articles, together
     * with everything collected about that person.
     */
    static class Author
    {
        String suffix = "";
        String firstName = "";
        String lastName = "";
        String initials = "";
        /** Lower-cased e-mail addresses attributed to this author. */
        Set<String> mails = new HashSet<String>();
        /** PubMed identifiers of the articles this author appears on. */
        Set<Integer> pmids = new TreeSet<Integer>();
        /** Product of the 0-based positions this author held in the author lists; tie-breaker when ranking. */
        int factor = 1;
        /** Affiliation strings of the articles this author appears on (field name kept as is). */
        Set<String> affilitations = new HashSet<String>();

        /** Hash code derived from first and last name, consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode()
        {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((firstName == null) ? 0 : firstName.hashCode());
            result = prime * result + ((lastName == null) ? 0 : lastName.hashCode());
            return result;
        }

        /** Two authors are equal when both first and last name are equal (exact, case-sensitive). */
        @Override
        public boolean equals(Object obj)
        {
            if (this == obj)
            {
                return true;
            }
            if (obj == null || getClass() != obj.getClass())
            {
                return false;
            }
            Author other = (Author) obj;
            return sameText(firstName, other.firstName) && sameText(lastName, other.lastName);
        }

        @Override
        public String toString()
        {
            return firstName + " " + lastName + " lab:" + this.affilitations + " mails:" + this.mails;
        }

        /**
         * Tells whether this (already known) author has a non-empty first name
         * and the same first and last name as {@code other}, as judged by
         * {@code collator}.
         *
         * @param other    the freshly parsed author
         * @param collator comparison rules for the names
         * @return {@code true} when both names match
         */
        boolean hasSameFullName(Author other, Collator collator)
        {
            return !this.firstName.isEmpty()
                && collator.compare(this.firstName, other.firstName) == 0
                && collator.compare(this.lastName, other.lastName) == 0;
        }

        /**
         * Looser match used when no full-name match exists: the last names
         * must be equal for {@code collator} and, in addition, either one
         * author's first name starts with the other's initials (ignoring
         * case) or both sets of initials are equal for {@code collator}.
         *
         * @param other    the freshly parsed author
         * @param collator comparison rules for names and initials
         * @return {@code true} when the two records are considered the same person
         */
        boolean hasCompatibleInitials(Author other, Collator collator)
        {
            if (collator.compare(this.lastName, other.lastName) != 0)
            {
                return false;
            }
            // Both sides are lower-cased: the original compared a lower-cased
            // first name with raw (upper-case) initials and so never matched.
            boolean myNameStartsWithTheirInitials = !other.initials.isEmpty()
                && lower(this.firstName).startsWith(lower(other.initials));
            boolean theirNameStartsWithMyInitials = !this.initials.isEmpty()
                && lower(other.firstName).startsWith(lower(this.initials));
            return myNameStartsWithTheirInitials
                || theirNameStartsWithMyInitials
                || collator.compare(this.initials, other.initials) == 0;
        }

        /**
         * Scans an affiliation string for e-mail addresses and records those
         * whose local part (the text before {@code '@'}) contains, or is equal
         * for {@code collator} to, this author's last name or (when longer
         * than one character) first name. Addresses are stored lower-cased.
         *
         * @param affiliation affiliation text of an article
         * @param collator    comparison rules used for the equality test
         */
        void addMailsFrom(String affiliation, Collator collator)
        {
            if (affiliation.indexOf('@') == -1)
            {
                return;
            }
            String last = lower(this.lastName);
            String first = lower(this.firstName);
            for (String token : ADDRESS_SEPARATORS.split(affiliation))
            {
                // The result must be re-assigned: String is immutable.
                String mail = CURLY_BRACES.matcher(token).replaceAll("");
                if (mail.endsWith("."))
                {
                    mail = mail.substring(0, mail.length() - 1);
                }
                int at = mail.indexOf('@');
                if (at == -1)
                {
                    continue;
                }
                String mailPrefix = lower(mail.substring(0, at));

                // An empty name is "contained" in every string; it must not
                // make every address match.
                boolean matchesLastName = !last.isEmpty()
                    && (mailPrefix.contains(last) || collator.compare(mailPrefix, this.lastName) == 0);
                boolean matchesFirstName = this.firstName.length() > 1
                    && (mailPrefix.contains(first) || collator.compare(mailPrefix, this.firstName) == 0);

                if (matchesLastName || matchesFirstName)
                {
                    LOG.info("Adding: " + mail + " to " + this);
                    this.mails.add(lower(mail));
                }
            }
        }

        /**
         * Writes this author as a {@code <Person>} element containing the
         * names, then one child per article id, e-mail and affiliation.
         *
         * @param w destination writer
         * @throws XMLStreamException if the writer fails
         */
        void write(XMLStreamWriter w)
            throws XMLStreamException
        {
            w.writeStartElement("Person");
            w.writeCharacters("\n");

            writeTextElement(w, "firstName", firstName);
            writeTextElement(w, "lastName", lastName);

            for (Integer id : pmids)
            {
                writeTextElement(w, "pmid", String.valueOf(id));
            }
            for (String mail : mails)
            {
                writeTextElement(w, "mail", mail);
            }
            for (String lab : affilitations)
            {
                // Element name is part of the output format; kept unchanged.
                writeTextElement(w, "affilitation", lab);
            }

            w.writeEndElement();
            w.writeCharacters("\n");
        }

        /** Writes {@code <name>text</name>} followed by a line break. */
        private static void writeTextElement(XMLStreamWriter w, String name, String text)
            throws XMLStreamException
        {
            w.writeStartElement(name);
            w.writeCharacters(text);
            w.writeEndElement();
            w.writeCharacters("\n");
        }

        /** Null-safe string equality. */
        private static boolean sameText(String left, String right)
        {
            return (left == null) ? (right == null) : left.equals(right);
        }

        /** Locale-independent lower-casing. */
        private static String lower(String text)
        {
            return text.toLowerCase(Locale.ROOT);
        }
    }

    /**
     * Builds the XML parser, the XPath engine and the name collator, and
     * switches logging off (it is re-enabled by {@code -v}).
     *
     * @throws Exception if the XML parser cannot be created
     */
    private BioStar4296() throws Exception
    {
        LOG.setLevel(Level.OFF);

        DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
        f.setNamespaceAware(false);
        f.setCoalescing(true);
        f.setIgnoringComments(true);
        f.setIgnoringElementContentWhitespace(true);
        f.setValidating(false);
        this.docBuilder = f.newDocumentBuilder();

        XPathFactory factory = XPathFactory.newInstance();
        this.xpath = factory.newXPath();

        // PRIMARY strength: differences of case and accent are ignored.
        this.collator = Collator.getInstance(Locale.FRENCH);
        this.collator.setStrength(Collator.PRIMARY);
    }

    /**
     * Looks up one gene and writes a {@code <gene>} element for it: either the
     * best-ranked author or an XML comment explaining why nothing was found.
     *
     * @param w        destination writer
     * @param geneName gene symbol to search for
     * @return {@code 0} when an author was written, {@code -1} otherwise
     * @throws Exception on network, parsing, XPath or writer failure
     */
    private int search(XMLStreamWriter w, String geneName)
        throws Exception
    {
        w.writeCharacters("\n");
        w.writeStartElement("gene");
        w.writeAttribute("name", geneName);

        // 1. gene symbol -> gene id
        String term = geneName + "[PREF] \"" + this.organism + "\"[ORGN]";
        Document dom = fetch(EUTILS_BASE + "esearch.fcgi?db=gene&term=" + URLEncoder.encode(term, "UTF-8"));
        NodeList list = (NodeList) this.xpath.evaluate("/eSearchResult/IdList/Id", dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return abandon(w, "Cannot find any entry for " + geneName);
        }
        if (list.getLength() != 1)
        {
            return abandon(w, "Ambigous name " + geneName);
        }
        String geneId = list.item(0).getTextContent();
        LOG.info("GeneId:" + geneId);
        w.writeAttribute("geneId", geneId);

        // 2. gene id -> PubMed ids quoted by the gene record
        dom = fetch(EUTILS_BASE + "efetch.fcgi?db=gene&id=" + geneId + "&rettype=text&retmode=xml");
        list = (NodeList) this.xpath.evaluate("//PubMedId", dom, XPathConstants.NODESET);
        if (list.getLength() == 0)
        {
            return abandon(w, "No pubmed for " + geneName);
        }
        Set<Integer> pmidSet = new TreeSet<Integer>();
        for (int articleIdx = 0; articleIdx < list.getLength(); ++articleIdx)
        {
            String pmid = list.item(articleIdx).getTextContent();
            LOG.info("PMID:" + pmid);
            pmidSet.add(Integer.parseInt(pmid.trim()));
        }
        w.writeAttribute("count-pmids", String.valueOf(pmidSet.size()));
        w.writeCharacters("\n");

        // 3. PubMed ids -> authors
        List<Author> authors = new ArrayList<Author>();
        for (Integer pmid : pmidSet)
        {
            collectAuthors(pmid, authors);
        }
        if (authors.isEmpty())
        {
            return abandon(w, "No Author found");
        }

        Collections.sort(authors, BEST_EXPERT_FIRST);
        authors.get(0).write(w);

        w.writeEndElement();
        return 0;
    }

    /** Logs and downloads one E-utilities answer as a DOM document. */
    private Document fetch(String url) throws Exception
    {
        LOG.info("url:" + url);
        return this.docBuilder.parse(url);
    }

    /** Writes {@code reason} as an XML comment, closes the current element and returns {@code -1}. */
    private static int abandon(XMLStreamWriter w, String reason) throws XMLStreamException
    {
        w.writeComment(reason);
        w.writeEndElement();
        return -1;
    }

    /**
     * Downloads one PubMed article and merges its authors into {@code authors}.
     * Articles without any {@code Affiliation} element are skipped entirely;
     * the first affiliation found is attributed to every author of the article.
     */
    private void collectAuthors(Integer pmid, List<Author> authors) throws Exception
    {
        LOG.info("PMID:" + pmid);
        Document dom = fetch(EUTILS_BASE + "efetch.fcgi?db=pubmed&id=" + pmid + "&retmode=xml");

        Node affiliationNode = (Node) this.xpath.evaluate("//Affiliation", dom, XPathConstants.NODE);
        if (affiliationNode == null)
        {
            return;
        }
        String affiliation = affiliationNode.getTextContent();
        LOG.info("affiliation:" + affiliation);

        NodeList authorList = (NodeList) this.xpath.evaluate("//AuthorList/Author", dom, XPathConstants.NODESET);
        LOG.info("Authors:" + authorList.getLength());

        for (int j = 0; j < authorList.getLength(); ++j)
        {
            Author parsed = parseAuthor(authorList.item(j));
            if (parsed == null)
            {
                continue; // collective (consortium) name, not a person
            }
            LOG.info("Make New Author:" + parsed);
            Author author = register(authors, parsed);

            // NOTE(review): j is 0-based, so being first author once sets the
            // factor to 0 for good; kept as in the original ranking. The
            // product is saturated instead of being allowed to overflow.
            long scaled = (long) author.factor * j;
            author.factor = (scaled > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int) scaled;

            author.affilitations.add(affiliation);
            author.pmids.add(pmid);
            author.addMailsFrom(affiliation, this.collator);
        }
    }

    /**
     * Reads the name parts of one {@code <Author>} element.
     *
     * @return the parsed author, or {@code null} when the element carries a
     *         {@code CollectiveName}
     */
    private static Author parseAuthor(Node authorNode)
    {
        Author author = new Author();
        for (Node child = authorNode.getFirstChild(); child != null; child = child.getNextSibling())
        {
            if (child.getNodeType() != Node.ELEMENT_NODE)
            {
                continue;
            }
            String tag = child.getNodeName();
            String content = child.getTextContent();
            if (tag.equals("LastName"))
            {
                author.lastName = content;
            }
            else if (tag.equals("FirstName") || tag.equals("ForeName"))
            {
                author.firstName = content;
            }
            else if (tag.equals("Initials"))
            {
                author.initials = content;
            }
            else if (tag.equals("CollectiveName"))
            {
                return null;
            }
            else if (tag.equals("Suffix"))
            {
                author.suffix = content;
            }
        }
        return author;
    }

    /**
     * Returns the already known author matching {@code candidate}, or adds
     * {@code candidate} to {@code known} and returns it. A full-name match is
     * searched first over the whole list, then a match on initials; in the
     * latter case the known record adopts the longer first name.
     */
    private Author register(List<Author> known, Author candidate)
    {
        for (Author p : known)
        {
            if (p.hasSameFullName(candidate, this.collator))
            {
                LOG.info("Same: " + p + " " + candidate);
                return p;
            }
        }
        for (Author p : known)
        {
            if (p.hasCompatibleInitials(candidate, this.collator))
            {
                LOG.info("Same: " + p + " " + candidate);
                if (p.firstName.length() < candidate.firstName.length())
                {
                    p.firstName = candidate.firstName;
                }
                return p;
            }
        }
        LOG.info("Adding: " + candidate);
        known.add(candidate);
        return candidate;
    }

    /**
     * Program entry point. Parses the options ({@code -h}, {@code -o organism},
     * {@code -v}, {@code --}) and then runs one search per remaining argument,
     * wrapping all results in an {@code <experts>} document on standard output.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args)
    {
        try
        {
            BioStar4296 app = new BioStar4296();
            int optind = 0;
            while (optind < args.length)
            {
                String arg = args[optind];
                if (arg.equals("-h"))
                {
                    System.err.println("Pierre Lindenbaum");
                    System.err.println("Options:");
                    System.err.println(" -o <organism> [" + app.organism + "]");
                    System.err.println(" -v show logs");
                    return;
                }
                else if (arg.equals("-o"))
                {
                    // Without this check a trailing -o ended in an
                    // ArrayIndexOutOfBoundsException stack trace.
                    if (optind + 1 >= args.length)
                    {
                        System.err.println("Option -o requires an organism name");
                        return;
                    }
                    app.organism = args[++optind];
                }
                else if (arg.equals("-v"))
                {
                    LOG.setLevel(Level.ALL);
                }
                else if (arg.equals("--"))
                {
                    optind++;
                    break;
                }
                else if (arg.startsWith("-"))
                {
                    System.err.println("Unnown option: " + arg);
                    return;
                }
                else
                {
                    break;
                }
                ++optind;
            }

            if (optind == args.length)
            {
                System.err.println("Gene Name missing");
                return;
            }

            XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
            XMLStreamWriter w = xmlfactory.createXMLStreamWriter(System.out, "UTF-8");
            try
            {
                w.writeStartDocument("UTF-8", "1.0");
                w.writeCharacters("\n");
                w.writeStartElement("experts");
                w.writeCharacters("\n");
                while (optind < args.length)
                {
                    app.search(w, args[optind]);
                    optind++;
                    w.writeCharacters("\n");
                }
                w.writeEndElement();
                w.writeEndDocument();
                w.flush();
            }
            finally
            {
                // Releases the writer's resources; System.out itself stays open.
                w.close();
            }
        }
        catch (Exception e)
        {
            // Top-level boundary of a command-line tool: report and end.
            e.printStackTrace();
        }
    }
}