22 September 2010

A Simple tool to get the sex ratio in pubmed.

Just for fun, I wrote a simple Java tool to get the sex ratio of the authors in PubMed. This program fetches a list of names/genders I found in the following Perl module: http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm. The source code is available at the end of this post.

(In the following examples, the many names that couldn't be associated with a gender were ignored.)

Bioinformatics


Here is the result for "Bioinformatics[journal]"
Women: 3178 (19%) Men: 13149 (80%)
Bioinformatics[Journal]


The 'Lancet' in 2009

Women: 579 (30%) Men: 1331 (69%)
Lancet[Journal] 2009[Date]


Nature in 2009

Women: 1616 (30%) Men: 3768 (69%)
Nature[Journal] 2009[Date]


Nursing in 2009

Women: 29 (70%) Men: 12 (29%)
Nursing[Journal] 2009[Date]



Articles about Charles Darwin

Women: 25 (17%) Men: 118 (82%)
"Darwin C"[PS]



etc... etc..

Source code

/**
* Author:
* Pierre Lindenbaum PhD
* plindenbaum@yahoo.fr
* Source of data:
* http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLEncoder;
import java.text.Collator;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.stream.events.XMLEvent;

/**
 * PubmedGender
 *
 * Command-line tool that estimates the ratio of women to men among the
 * authors of the PubMed articles matching a query.
 *
 * Steps: (1) download two first-name tables (male / female) from a Perl
 * module, (2) run the query through NCBI esearch with history enabled,
 * (3) fetch the matching records with efetch and classify each author by
 * first name, (4) print an HTML fragment (canvas pie chart + counts) on
 * standard output.
 *
 * Usage: java PubmedGender [-h] [-w size] [-L limit] [-i] query terms...
 */
public class PubmedGender
{
    /** Perl module holding the '$Males = {' and '$Females = {' name tables. */
    private static final String NAMES_URL =
        "http://cpansearch.perl.org/src/EDALY/Text-GenderFromName-0.33/GenderFromName.pm";
    /** Base URL of the NCBI E-utilities. */
    private static final String EUTILS_URL =
        "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/";

    /** first name -> numeric value read from the '$Males' table. */
    private final Map<String,Float> males;
    /** first name -> numeric value read from the '$Females' table. */
    private final Map<String,Float> females;
    /** maximum number of articles requested from PubMed (option -L). */
    private int limit=1000;
    /** PubMed query, built from the remaining command-line arguments. */
    private String query="";
    /** width/height of the pie chart in pixels (option -w). */
    private int canvasSize=200;
    /** when true, authors of unknown gender are left out of the totals (option -i). */
    private boolean ignoreUndefined=false;

    /** Mutable counters filled while scanning the authors. */
    private static final class Tally
    {
        int males=0;
        int females=0;
        int unknown=0;
    }

    private PubmedGender()
    {
        // PRIMARY strength: look-ups ignore case and accents.
        Collator collator= Collator.getInstance(Locale.US);
        collator.setStrength(Collator.PRIMARY);
        this.males=new TreeMap<String, Float>(collator);
        this.females=new TreeMap<String, Float>(collator);
    }

    /**
     * Downloads the Perl module and fills {@link #males} and {@link #females}.
     * Lines of the form {@code 'name' => value,} found after a
     * '$Males = {' or '$Females = {' line are stored in the matching table;
     * any other line ends the current table.
     *
     * @throws IOException if the module cannot be downloaded or read
     */
    private void loadNames()
        throws IOException
    {
        // Explicit charset so the result does not depend on the platform default
        // (the name tables are expected to be plain ASCII).
        BufferedReader in=new BufferedReader(
            new InputStreamReader(new URL(NAMES_URL).openStream(),"UTF-8"));
        try
        {
            String line;
            Map<String,Float> map=null;
            while((line=in.readLine())!=null)
            {
                if(line.startsWith("$Males = {"))
                {
                    map=this.males;
                    continue;
                }
                if(line.startsWith("$Females = {"))
                {
                    map=this.females;
                    continue;
                }
                int posAssign=line.indexOf("=>");
                if(line.contains("}") || map==null || posAssign==-1)
                {
                    map=null;
                    continue;
                }
                String name=line.substring(0,posAssign)
                    .replaceAll("'","").toLowerCase(Locale.ROOT).trim();
                Float freq=Float.parseFloat(line.substring(posAssign+2)
                    .replaceAll("[',]","").toLowerCase(Locale.ROOT).trim());
                map.put(name, freq);
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Creates a non-validating, coalescing StAX event reader with DTD support
     * disabled. The caller owns {@code in} and must close it: closing the
     * reader does not close the underlying stream.
     */
    private XMLEventReader newReader(java.io.InputStream in) throws XMLStreamException
    {
        XMLInputFactory f= XMLInputFactory.newInstance();
        f.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        f.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE,Boolean.FALSE);
        f.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES,Boolean.TRUE);
        f.setProperty(XMLInputFactory.IS_VALIDATING,Boolean.FALSE);
        f.setProperty(XMLInputFactory.SUPPORT_DTD,Boolean.FALSE);
        return f.createXMLEventReader(in);
    }

    /**
     * Runs the query through esearch (history enabled) and builds the efetch
     * URL for the stored result set.
     *
     * @return the efetch URL, or {@code null} when the search returned no Id
     * @throws IOException if the request fails or the answer carries Ids but
     *         no QueryKey/WebEnv
     */
    private URL searchPubmed() throws IOException,XMLStreamException
    {
        URL url= new URL(
            EUTILS_URL+"esearch.fcgi?db=pubmed&term="+
            URLEncoder.encode(this.query, "UTF-8")+
            "&retstart=0&retmax="+this.limit+"&usehistory=y&retmode=xml&email=plindenbaum_at_yahoo.fr&tool=gender");

        String queryKey=null;
        String webEnv=null;
        int countId=0;
        java.io.InputStream in=url.openStream();
        try
        {
            XMLEventReader reader= newReader(in);
            try
            {
                while(reader.hasNext())
                {
                    XMLEvent evt=reader.nextEvent();
                    if(!evt.isStartElement()) continue;
                    String tag= evt.asStartElement().getName().getLocalPart();
                    if(tag.equals("QueryKey"))
                    {
                        queryKey= reader.getElementText().trim();
                    }
                    else if(tag.equals("WebEnv"))
                    {
                        webEnv= reader.getElementText().trim();
                    }
                    else if(tag.equals("Id"))
                    {
                        ++countId;
                    }
                }
            }
            finally
            {
                reader.close();
            }
        }
        finally
        {
            in.close();
        }

        if(countId==0) return null;
        if(queryKey==null || webEnv==null)
        {
            // Without these two values the history server cannot be queried.
            throw new IOException("esearch returned no QueryKey/WebEnv for query: "+this.query);
        }
        return new URL(EUTILS_URL+"efetch.fcgi?db=pubmed&WebEnv="+
            URLEncoder.encode(webEnv,"UTF-8")+
            "&query_key="+URLEncoder.encode(queryKey,"UTF-8")+
            "&retmode=xml&retmax="+this.limit+"&email=plindenbaum_at_yahoo.fr&tool=mail");
    }

    /**
     * Streams the efetch answer and classifies every Author element, reading
     * its ForeName/FirstName and Initials children.
     */
    private void countAuthors(URL url,Tally tally) throws IOException,XMLStreamException
    {
        java.io.InputStream in=url.openStream();
        try
        {
            XMLEventReader reader= newReader(in);
            try
            {
                while(reader.hasNext())
                {
                    XMLEvent evt=reader.nextEvent();
                    if(!evt.isStartElement()) continue;
                    if(!evt.asStartElement().getName().getLocalPart().equals("Author")) continue;
                    String firstName=null;
                    String initials=null;

                    // Consume the children up to the closing Author tag.
                    while(reader.hasNext())
                    {
                        evt=reader.nextEvent();
                        if(evt.isStartElement())
                        {
                            String localName=evt.asStartElement().getName().getLocalPart();
                            if(localName.equals("ForeName") || localName.equals("FirstName"))
                            {
                                firstName=reader.getElementText().toLowerCase(Locale.ROOT);
                            }
                            else if(localName.equals("Initials"))
                            {
                                initials=reader.getElementText().toLowerCase(Locale.ROOT);
                            }
                        }
                        else if(evt.isEndElement() &&
                            evt.asEndElement().getName().getLocalPart().equals("Author"))
                        {
                            break;
                        }
                    }
                    classify(firstName,initials,tally);
                }
            }
            finally
            {
                reader.close();
            }
        }
        finally
        {
            in.close();
        }
    }

    /**
     * Updates {@code tally} for one author. Authors without a usable first
     * name (missing, empty, a single letter, or identical to the initials)
     * are skipped and counted nowhere. Otherwise the longest space-separated
     * token of the first name is looked up in both tables; when it is found
     * in both, the table holding the larger value wins and equal values count
     * as unknown.
     */
    private void classify(String firstName,String initials,Tally tally)
    {
        if( firstName==null ) return;
        if( firstName.length()<2 || firstName.equals(initials)) return;

        // Keep the longest token (the first one on ties), e.g. "j robert" -> "robert".
        String longest="";
        for(String token:firstName.split("[ ]+"))
        {
            if(token.length()> longest.length())
            {
                longest=token;
            }
        }
        if( longest.length()<2 || longest.equals(initials)) return;

        Float male= this.males.get(longest);
        Float female= this.females.get(longest);

        if(male==null && female==null)
        {
            tally.unknown++;
        }
        else if(female==null)
        {
            tally.males++;
        }
        else if(male==null)
        {
            tally.females++;
        }
        else if(male < female)
        {
            tally.females++;
        }
        else if(female < male)
        {
            tally.males++;
        }
        else
        {
            tally.unknown++;
        }
    }

    /** Returns count/total, or 0 when total is 0 so that no NaN reaches the output. */
    private static float fraction(int count,float total)
    {
        return total==0f ? 0f : count/total;
    }

    /** Truncated percentage of {@code count} in {@code total}. */
    private static int percent(int count,float total)
    {
        return (int)(fraction(count,total)*100.0);
    }

    /**
     * Writes the HTML fragment on standard output: a canvas pie chart drawn
     * by an inline script, the counts with their percentages and a link to
     * the PubMed search.
     */
    private void writeHtml(Tally tally) throws IOException,XMLStreamException
    {
        int countMales=tally.males;
        int countFemales=tally.females;
        int countUnknown=(this.ignoreUndefined ? 0 : tally.unknown);

        float total= countMales+countFemales+countUnknown;

        double radMale=fraction(countMales,total)*Math.PI*2.0;
        double radFemale=fraction(countFemales,total)*Math.PI*2.0;
        int radius= (canvasSize-2)/2;
        int center= canvasSize/2;
        // Identifier for the canvas and its paint function, built from the clock and a random number.
        String id= "ctx"+System.currentTimeMillis()+""+(int)(Math.random()*1000);
        XMLOutputFactory xmlfactory= XMLOutputFactory.newInstance();
        XMLStreamWriter w= xmlfactory.createXMLStreamWriter(System.out,"UTF-8");
        w.writeStartElement("html");
        w.writeStartElement("body");
        w.writeStartElement("div");
        w.writeAttribute("style","margin:10px;padding:10px;text-align:center;");
        w.writeStartElement("div");
        w.writeEmptyElement("canvas");
        w.writeAttribute("width", String.valueOf(canvasSize+1));
        w.writeAttribute("height", String.valueOf(canvasSize+1));
        w.writeAttribute("id", id);
        w.writeStartElement("script");
        w.writeCharacters(
            "function paint"+id+"(){var canvas=document.getElementById('"+id+"');"+
            "if (!canvas.getContext) return;var c=canvas.getContext('2d');"+
            "c.fillStyle='white';c.strokeStyle='black';"+
            "c.fillRect(0,0,"+canvasSize+","+canvasSize+");"+
            "c.fillStyle='gray';c.beginPath();c.arc("+center+","+center+","+radius+",0,Math.PI*2,true);c.fill();c.stroke();"+
            "c.fillStyle='blue';c.beginPath();c.moveTo("+center+","+center+");c.arc("+center+","+center+","+radius+",0,"+radMale+",false);c.closePath();c.fill();c.stroke();"+
            "c.fillStyle='pink';c.beginPath();c.moveTo("+center+","+center+");c.arc("+center+","+center+","+radius+","+radMale+","+(radMale+radFemale)+",false);c.closePath();c.fill();c.stroke();}"+
            "window.addEventListener('load',function(){ paint"+id+"(); },true);"
            );
        w.writeEndElement();//script
        w.writeEndElement();//inner div

        w.writeStartElement("span");
        w.writeAttribute("style","color:pink;");
        w.writeCharacters("Women: "+countFemales+" ("+percent(countFemales,total)+"%)");
        w.writeEndElement();
        w.writeCharacters(" ");
        w.writeStartElement("span");
        w.writeAttribute("style","color:blue;");
        w.writeCharacters("Men: "+countMales+" ("+percent(countMales,total)+"%)");
        w.writeEndElement();
        w.writeCharacters(" ");

        if(!this.ignoreUndefined)
        {
            w.writeStartElement("span");
            w.writeAttribute("style","color:gray;");
            w.writeCharacters("Undefined : "+countUnknown+" ("+percent(countUnknown,total)+"%)");
            w.writeEndElement();
        }
        w.writeEmptyElement("br");

        w.writeStartElement("a");
        w.writeAttribute("target","_blank");
        // writeAttribute escapes '&' itself: passing "&amp;" would emit "&amp;amp;".
        w.writeAttribute("href","http://www.ncbi.nlm.nih.gov/sites/entrez?db=pubmed&cmd=search&term="+URLEncoder.encode(this.query,"UTF-8"));
        w.writeCharacters(this.query);
        w.writeEndElement();

        w.writeEndElement();//outer div
        w.writeEndElement();//body
        w.writeEndElement();//html
        w.flush();
        w.close();
    }

    /** Searches PubMed, counts the authors by gender and prints the HTML report. */
    private void run() throws Exception
    {
        Tally tally=new Tally();
        URL fetchUrl=searchPubmed();
        if(fetchUrl!=null)
        {
            countAuthors(fetchUrl,tally);
        }
        writeHtml(tally);
    }

    /**
     * Reads the integer value of an option.
     *
     * @return the value, or {@code null} (after printing a message on stderr)
     *         when the value is missing or is not an integer
     */
    private static Integer readIntOption(String[] args,int index,String option)
    {
        if(index>=args.length)
        {
            System.err.println("Option "+option+" requires an integer value");
            return null;
        }
        try
        {
            return Integer.valueOf(args[index]);
        }
        catch(NumberFormatException e)
        {
            System.err.println("Bad integer value for option "+option+": "+args[index]);
            return null;
        }
    }

    /**
     * Parses the command line into {@code app}.
     *
     * @return {@code true} when the program should go on, {@code false} when
     *         help was printed or the arguments are invalid
     */
    private static boolean parseArgs(PubmedGender app,String[] args)
    {
        int optind=0;
        while(optind< args.length)
        {
            String arg=args[optind];
            if(arg.equals("-h") ||
               arg.equals("-help") ||
               arg.equals("--help"))
            {
                System.err.println("Options:");
                System.err.println(" -h help; This screen.");
                System.err.println(" -w <int> canvas size default:"+app.canvasSize);
                System.err.println(" -L <int> limit number default:"+app.limit);
                System.err.println(" -i ignore undefined default:"+app.ignoreUndefined);
                System.err.println(" query terms...");
                return false;
            }
            else if(arg.equals("-L") || arg.equals("-w"))
            {
                Integer value=readIntOption(args,++optind,arg);
                if(value==null) return false;
                if(arg.equals("-L"))
                {
                    app.limit=value.intValue();
                }
                else
                {
                    app.canvasSize=value.intValue();
                }
            }
            else if(arg.equals("-i"))
            {
                app.ignoreUndefined=true;
            }
            else if(arg.equals("--"))
            {
                optind++;
                break;
            }
            else if(arg.startsWith("-"))
            {
                System.err.println("Unknown option "+arg);
                return false;
            }
            else
            {
                break;
            }
            ++optind;
        }
        if(optind>=args.length)
        {
            System.err.println("Query missing");
            return false;
        }
        // The remaining arguments, joined with single spaces, form the query.
        StringBuilder terms=new StringBuilder();
        while(optind< args.length)
        {
            if(terms.length()>0) terms.append(' ');
            terms.append(args[optind++]);
        }
        app.query=terms.toString().trim();
        if(app.query.isEmpty())
        {
            System.err.println("Query is empty");
            return false;
        }
        return true;
    }

    /**
     * Entry point. Usage errors are reported as a single line on stderr;
     * any other failure prints its stack trace.
     */
    public static void main(String[] args)
    {
        try
        {
            PubmedGender app=new PubmedGender();
            if(!parseArgs(app,args)) return;
            app.loadNames();
            app.run();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}


That's it

Pierre

4 comments:

martha said...

This is good, I like it very much: great information, and a very easy tool for getting the ratio.

bioinformatics training india

Larry_Parnell said...

This looks quite interesting. I wonder if you can comment on the parsing of sex for Chinese or Indonesian (often a single name) authors. Granted there are not many articles from Indonesia in PubMed, but names from some Asian countries might be a challenge. Thanks!

Pierre Lindenbaum said...

Larry, the source of data only contains the American names, that is why many authors haven't been mapped to a gender.

Laozi said...

A further filter like nationality may make it more interesting.