18 February 2011

A Data Scraper for Amazonia (expression)


Today, we had a lecture about the "human induced pluripotent stem cells", presented by John De Vos. He introduced Amazonia, a free web atlas that allows an easy query of public human transcriptome data. Although there is no web service (REST/SOAP) to access this data, I was interested in getting some profiles of expression from this database as it is something I've failed to achieve with NCBI/GEO.

I wrote the following Java scraper:

/**
* Author:
* Pierre Lindenbaum PhD
* Mail:
* plindenbaum@yahoo.fr
* WWW:
* http://plindenbaum.blogspot.com
* Motivation:
* A scraper for http://amazonia.transcriptome.eu
* Compilation:
* javac AmazoniaRobot.java
* Execution
* java AmazoniaRobot EIF4G1
*
*/
import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.Image;
import java.awt.Toolkit;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.swing.ImageIcon;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTabbedPane;
import javax.swing.border.EmptyBorder;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Command-line scraper for http://amazonia.transcriptome.eu.
 *
 * Given a gene name, it queries the site's search page, follows the redirect
 * to the gene page, extracts the titled sections and the histogram images
 * linked from them, and displays the images in a Swing dialog (one tab per
 * image).
 */
public class AmazoniaRobot
{
    /** Root of the web site; every relative URL is resolved against it. */
    private static final URL BASE;
    static {
        try
        {
            BASE = new URL("http://amazonia.transcriptome.eu");
        }
        catch (IOException err)
        {
            // keep the cause so a failure here can be diagnosed
            throw new RuntimeException("Bad base URL", err);
        }
    }

    /** A titled section of the gene page and the histogram images found after it. */
    private static class Profile
    {
        /** Text content of the <div class="fieldTitle"> element. */
        String name;
        /**
         * Absolute image URLs in their external (string) form. Strings are used
         * instead of java.net.URL because URL.equals/hashCode resolve host names,
         * which makes hash-based collections of URL block on name lookups.
         */
        Set<String> imgs = new HashSet<String>();
    }

    /**
     * Searches the site for a gene alias and scrapes the resulting gene page.
     *
     * The search request is sent with automatic redirection disabled: a
     * "Location" response header is taken to mean the gene was found, and the
     * page it points to is then downloaded and parsed.
     *
     * @param query the gene name / alias to search for
     * @return the profiles found on the gene page; an empty list when the
     *         search did not answer with a redirect (never null)
     * @throws Exception on network, XML parsing or XPath errors
     */
    private List<Profile> search(String query) throws Exception
    {
        URL url = new URL(BASE, "search.php?searchField=alias&id=" + URLEncoder.encode(query, "UTF-8"));
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        String location;
        try
        {
            // connection parameters must be set before connecting
            con.setInstanceFollowRedirects(false);
            con.connect();
            location = con.getHeaderField("Location");
        }
        finally
        {
            con.disconnect();
        }
        if (location == null)
        {
            return new ArrayList<Profile>();
        }
        return parseProfiles(download(new URL(BASE, location)));
    }

    /**
     * Downloads the content of a URL as text.
     *
     * @param url the page to fetch
     * @return the response body decoded by {@link #toString(InputStream)}
     * @throws IOException if the connection or the read fails
     */
    private static String download(URL url) throws IOException
    {
        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        try
        {
            con.connect();
            InputStream in = con.getInputStream();
            try
            {
                return toString(in);
            }
            finally
            {
                in.close();
            }
        }
        finally
        {
            con.disconnect();
        }
    }

    /**
     * Patches an HTML page so that it can be read by an XML parser: every
     * "<meta ...>" tag that is not already self-closed gets a "/" inserted
     * before its ">", and every "&" is replaced by "&amp;".
     *
     * NOTE(review): the ampersand replacement is unconditional, so entities
     * that are already escaped in the page get escaped a second time.
     *
     * @param html the raw page
     * @return the patched page
     */
    private static String toXhtml(String html)
    {
        int n = -1;
        while ((n = html.indexOf("<meta", n + 1)) != -1)
        {
            int n2 = html.indexOf('>', n + 1);
            if (n2 == -1) break;
            if (html.charAt(n2 - 1) != '/')
            {
                html = html.substring(0, n2) + "/" + html.substring(n2);
            }
            n = n2;
        }
        return html.replace("&", "&amp;");
    }

    /**
     * Extracts the profiles from the HTML of a gene page.
     *
     * One profile is created per <div class="fieldTitle">. Its images are the
     * links whose href starts with "http:temp/histo_", looked up below each
     * following sibling of the title element; they are rewritten to
     * BASE + "/temp/" + file name.
     *
     * NOTE(review): the sibling walk does not stop at the next title, so if
     * several titles share one parent an earlier profile also collects the
     * images of the later ones - confirm against the real page structure.
     *
     * @param html the raw page (patched internally with {@link #toXhtml(String)})
     * @return the profiles in document order, possibly empty
     * @throws Exception on XML parsing or XPath errors
     */
    private static List<Profile> parseProfiles(String html) throws Exception
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setCoalescing(true);
        factory.setNamespaceAware(false);
        factory.setExpandEntityReferences(true);
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        factory.setIgnoringElementContentWhitespace(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        // answer every external entity (e.g. the DTD) with an empty document
        // so that the parser never fetches anything
        builder.setEntityResolver(new EntityResolver()
        {
            @Override
            public InputSource resolveEntity(String publicId, String systemId)
                throws SAXException, IOException
            {
                return new InputSource(new StringReader(""));
            }
        });
        Document dom = builder.parse(new InputSource(new StringReader(toXhtml(html))));

        XPath xpath = XPathFactory.newInstance().newXPath();
        List<Profile> profiles = new ArrayList<Profile>();
        NodeList titles = (NodeList) xpath.evaluate("//div[@class='fieldTitle']", dom, XPathConstants.NODESET);
        for (int i = 0; i < titles.getLength(); ++i)
        {
            Node title = titles.item(i);
            Profile profile = new Profile();
            profile.name = title.getTextContent();
            for (Node sibling = title.getNextSibling(); sibling != null; sibling = sibling.getNextSibling())
            {
                NodeList links = (NodeList) xpath.evaluate(
                    ".//a[starts-with(@href,'http:temp/histo_')]", sibling, XPathConstants.NODESET);
                for (int j = 0; j < links.getLength(); ++j)
                {
                    // drop the 10-character "http:temp/" prefix, keep the file name
                    String href = Element.class.cast(links.item(j)).getAttribute("href").substring(10);
                    profile.imgs.add(new URL(BASE, "/temp/" + href).toExternalForm());
                }
            }
            profiles.add(profile);
        }
        return profiles;
    }

    /**
     * Reads a stream to its end and returns its content as a string. The
     * stream is closed before returning, also when the read fails.
     *
     * NOTE(review): bytes are decoded with the platform default charset; the
     * charset announced by the server is not taken into account.
     *
     * @param input the stream to drain
     * @return the decoded content
     * @throws IOException if reading fails
     */
    private static String toString(InputStream input) throws IOException
    {
        StringBuilder b = new StringBuilder();
        Reader in = new InputStreamReader(input);
        try
        {
            char[] array = new char[2048];
            int nRead;
            while ((nRead = in.read(array)) != -1)
            {
                b.append(array, 0, nRead);
            }
        }
        finally
        {
            in.close();
        }
        return b.toString();
    }

    /**
     * Entry point. Expects exactly one argument, the gene name, and shows the
     * images found for it in a dialog sized to 80% of the screen.
     *
     * @param args the command line: a single gene name
     */
    public static void main(String[] args)
    {
        try {
            if (args.length == 0)
            {
                System.err.println("Gene Name missing");
                return;
            }
            else if (args.length != 1)
            {
                System.err.println("Illegal number of arguments.");
                return;
            }
            String geneName = args[0];
            AmazoniaRobot app = new AmazoniaRobot();
            List<Profile> profiles = app.search(geneName);
            if (profiles.isEmpty())
            {
                // gene unknown to the site, or a page without any titled section
                System.err.println("No expression profile found for " + geneName);
                return;
            }
            Dimension dim = Toolkit.getDefaultToolkit().getScreenSize();
            JPanel pane = new JPanel(new BorderLayout(5, 5));
            pane.setPreferredSize(new Dimension(
                (int) (dim.width * 0.8),
                (int) (dim.height * 0.8)
                ));
            pane.setBorder(new EmptyBorder(5, 5, 5, 5));
            pane.add(new JLabel(geneName, JLabel.LEFT), BorderLayout.NORTH);
            JTabbedPane tabbed = new JTabbedPane();
            pane.add(tabbed, BorderLayout.CENTER);
            for (Profile profile : profiles)
            {
                for (String imgUrl : profile.imgs)
                {
                    // one tab per image, labelled with the section title
                    JPanel pane2 = new JPanel(new BorderLayout(5, 5));
                    pane2.add(new JLabel(profile.name, JLabel.LEFT), BorderLayout.NORTH);
                    tabbed.addTab(profile.name, pane2);
                    Image img = Toolkit.getDefaultToolkit().createImage(new URL(imgUrl));
                    pane2.add(new JScrollPane(new JLabel(new ImageIcon(img))));
                }
            }
            JOptionPane.showMessageDialog(null, pane);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }
}

  • Line 84: we search for a gene name
  • 88: if there is a http redirection, the gene has been found
  • 96: the HTML page is downloaded
  • 100-112: fix the HTML to create a valid XML document
  • 133: transform the HTML page to a DOM document
  • 135-151: use XPATH to find the images and the labels
  • 189-211: put the data into a Java/Swing dialog

Compilation

javac AmazoniaRobot.java

Execution

java AmazoniaRobot EIF4G1
Et voilà:


That's it !

Pierre

2 comments:

Mikael said...

Thanks for this - very useful!

Unknown said...

tu es le meilleur ;-)