I'll use some tools that I described a few years ago and have since rewritten.
Downloading the data
To download the papers published in Bioinformatics, the PubMed/Entrez query is '"Bioinformatics"[jour]'. I use pubmeddump to download all those articles as XML from PubMed.
# Download every article matching the query from PubMed as XML (written to standard output)
java -jar jvarkit/dist/pubmeddump.jar '"Bioinformatics"[jour]'
Adding the authors' gender
PubmedGender is used to add the attributes '@male' and/or '@female' to the Pubmed/XML '<Author>' element.
<Author ValidYN="Y" male="169">
<LastName>Lindenbaum</LastName>
<ForeName>Pierre</ForeName>
Adding the authors' location
PubmedMap is used to add some attributes to the Pubmed/XML '<Affiliation>' element.
<Author>
<LastName>Lai</LastName>
<ForeName>Chih-Cheng</ForeName>
<Initials>CC</Initials>
<AffiliationInfo>
<Affiliation domain="tw" place="Taiwan">Department of Intensive Care Medicine, Chi Mei Medical Center, Liouying, Tainan, Taiwan.</Affiliation>
Extracting the data from XML as a table
I use SAXScript to extract the data from XML. A SAX parser is an event-driven parser for XML. Here the events are handled by a simple JavaScript program.
The script below finds the sex, the year of publication and the location of the first author of each article, and prints the results as a text table.
/** Text buffer for the element being read; null while nothing is being captured. */
var content = null;

/** Number of <Author> elements already closed in the current article. */
var count_authors = 0;

/** Record describing the first author while it is being read; null otherwise. */
var author = null;

/** True between the opening and the closing <PubDate> tag. */
var in_pubdate = false;

/** Publication year read for the current article; null until one is seen. */
var year = null;
/**
 * SAX callback invoked each time an opening XML tag is found.
 *
 * - <PubDate>: remembers that the parser is inside a publication date.
 * - <Year> inside <PubDate>: starts buffering the text of the element.
 * - the first <Author> of an article: guesses the sex from the 'male' and
 *   'female' attributes and, when one can be guessed, creates the
 *   author record {sex, year, domain}.
 * - <Affiliation> while an author record exists: stores the 'domain'
 *   attribute in that record.
 *
 * NOTE(review): loose equality (==, !=) is kept on purpose. The script is run
 * by a Java program, so element names and attribute values may not be
 * primitive JavaScript strings; confirm before tightening to ===.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local name of the element (unused)
 * @param name      name of the element, compared with the Pubmed tag names
 * @param atts      attributes of the element; atts.getValue(key) is expected
 *                  to return null when the attribute is absent
 */
function startElement(uri,localName,name,atts)
{
    if(name=="PubDate")
    {
        in_pubdate=true;
    }
    else if(in_pubdate && name=="Year")
    {
        /* start collecting the text of <Year> */
        content="";
    }
    else if(name=="Author" && count_authors==0)
    {
        content="";
        /* get sex */
        var male = atts.getValue("male");
        var female = atts.getValue("female");
        var gender = null;
        if(male!=null && female!=null)
        {
            /* both male & female: keep the highest score (ties give "F").
               The radix is explicit so the scores are always read as decimal. */
            var fm = parseInt(male,10);
            var ff = parseInt(female,10);
            gender = (fm>ff?"M":"F");
        }
        else if(male!=null)
        {
            gender = "M";
        }
        else if(female!=null)
        {
            gender = "F";
        }
        /* no record is created when the sex is unknown */
        if(gender!=null) author={"sex":gender,"year":year,"domain":null};
    }
    else if(author!=null && name=="Affiliation")
    {
        /* An author can have several affiliations: keep the first known
           domain instead of letting a later affiliation without a 'domain'
           attribute reset it to null. */
        var domain = atts.getValue("domain");
        if(author.domain==null && domain!=null) author.domain=domain;
    }
}
/**
 * SAX callback invoked for a text node.
 * The text is appended to the buffer only while an element is being
 * captured, i.e. while the buffer is not null.
 *
 * @param s the text that was read
 */
function characters(s)
{
    /* nothing is being captured: drop the text */
    if(content==null) return;
    content=content+s;
}
/**
 * SAX callback invoked each time a closing XML tag is found.
 *
 * - end of an article (PubmedArticle or PubmedBookArticle): resets the
 *   per-article state.
 * - end of an <Author>: counts the author and, if a record exists for it,
 *   prints "sex<TAB>year<TAB>domain" and discards the record.
 * - end of <PubDate>: leaves the publication date.
 * - end of a <Year> inside <PubDate>: stores the buffered text as the year.
 *
 * In every case the text buffer is discarded afterwards.
 *
 * @param uri       namespace URI of the element (unused)
 * @param localName local name of the element (unused)
 * @param name      name of the element that is closing
 */
function endElement(uri,localName,name)
{
    var closesArticle = (name=="PubmedArticle" || name=="PubmedBookArticle");

    if(closesArticle)
    {
        /* forget everything collected for the record that just ended */
        in_pubdate=false;
        year=null;
        author=null;
        count_authors=0;
    }
    else if(name=="Author")
    {
        count_authors+=1;
        /* a record only exists for a first author whose sex is known */
        if(author!=null)
        {
            var row = author.sex+"\t"+author.year+"\t"+author.domain;
            print(row);
            author=null;
        }
    }
    else if(name=="PubDate")
    {
        in_pubdate=false;
    }
    else if(name=="Year" && in_pubdate)
    {
        year=content;
    }

    /* stop capturing text until another element asks for it */
    content=null;
}
All in one
# download the database of names
wget -O names.zip "https://www.ssa.gov/oact/babynames/names.zip"
unzip -p names.zip yob2015.txt > names.csv
rm names.zip

# download the articles, add the gender and the location, extract the table
java -jar jvarkit/dist/pubmeddump.jar '"Bioinformatics"[jour]' |\
java -jar jvarkit/dist/pubmedgender.jar -d names.csv |\
java -jar jvarkit/dist/pubmedmap.jar |\
java -jar src/jsandbox/dist/saxscript.jar -f pubmed.js > data.csv
The output (count, sex, year, country):
$ cat data.csv | sort | uniq -c | sort -n
(...)
105 M 2015 us
107 M 2004 us
107 M 2013 us
115 M 2008 us
117 M 2011 us
120 M 2009 us
122 M 2010 us
126 M 2014 us
130 M 2012 us
139 M 2005 us
That's it, Pierre
No comments:
Post a Comment