A Data Scraper for Amazonia (expression)
Today we had a lecture about human induced pluripotent stem cells, presented by John De Vos. He introduced Amazonia, a free web atlas for querying public human transcriptome data. Although there is no web service (REST/SOAP) to access this data, I was interested in retrieving some expression profiles from this database, something I had failed to achieve with NCBI/GEO.
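The trick is simply to hit the site's own search page and watch for an HTTP redirect. Here is a minimal sketch of that request (the URL pattern is the one used by the scraper below; EIF4G1 is just an example gene symbol):

import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

public class AmazoniaQueryDemo
    {
    public static void main(String[] args) throws Exception
        {
        //ask Amazonia's search page for a gene symbol
        URL url=new URL("http://amazonia.transcriptome.eu/search.php?searchField=alias&id="
            +URLEncoder.encode("EIF4G1","UTF-8"));
        HttpURLConnection con=(HttpURLConnection)url.openConnection();
        con.setInstanceFollowRedirects(false);//keep the redirect visible
        con.connect();
        //a "Location" header means the symbol was resolved to a gene page
        System.out.println("Location: "+con.getHeaderField("Location"));
        con.disconnect();
        }
    }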
I wrote the following Java scraper:
/**
 * Author:
 *      Pierre Lindenbaum PhD
 * Mail:
 *      plindenbaum@yahoo.fr
 * WWW:
 *      http://plindenbaum.blogspot.com
 * Motivation:
 *      A scraper for http://amazonia.transcriptome.eu
 * Compilation:
 *      javac AmazoniaRobot.java
 * Execution:
 *      java AmazoniaRobot EIF4G1
 */
import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.Image;
import java.awt.Toolkit;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.swing.ImageIcon;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTabbedPane;
import javax.swing.border.EmptyBorder;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

public class AmazoniaRobot
    {
    private static final URL BASE;
    static
        {
        try
            {
            BASE=new URL("http://amazonia.transcriptome.eu");
            }
        catch(IOException err)
            {
            throw new RuntimeException("Bad base URL");
            }
        }

    /** one expression profile: a title and its image(s) */
    private static class Profile
        {
        String name;
        Set<URL> imgs=new HashSet<URL>();
        }

    private List<Profile> search(String query) throws Exception
        {
        List<Profile> profiles=new ArrayList<Profile>();
        //submit the gene name to the site's search page
        URL url=new URL(BASE,"search.php?searchField=alias&id="+URLEncoder.encode(query, "UTF-8"));
        HttpURLConnection con=(HttpURLConnection)url.openConnection();
        con.setInstanceFollowRedirects(false);//must be set before connect()
        con.connect();
        String location=con.getHeaderField("Location");
        con.disconnect();
        //no redirection: the gene was not found
        if(location==null) return profiles;
        //download the gene page behind the redirection
        url=new URL(BASE,location);
        con=(HttpURLConnection)url.openConnection();
        con.connect();
        InputStream in=con.getInputStream();
        String html=toString(in);
        in.close();
        con.disconnect();
        //fix the HTML so it can be parsed as XML: close the <meta> tags...
        int n=-1;
        while((n=html.indexOf("<meta",n+1))!=-1)
            {
            int n2=html.indexOf('>',n+1);
            if(n2==-1) break;
            if(html.charAt(n2-1)!='/')
                {
                html=html.substring(0,n2)+"/"+html.substring(n2);
                }
            n=n2;
            }
        //...and escape the bare ampersands
        html=html.replace("&", "&amp;");
        DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
        factory.setCoalescing(true);
        factory.setNamespaceAware(false);
        factory.setExpandEntityReferences(true);
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        factory.setIgnoringElementContentWhitespace(true);
        DocumentBuilder builder=factory.newDocumentBuilder();
        //ignore the DTD
        builder.setEntityResolver(new EntityResolver()
            {
            @Override
            public InputSource resolveEntity(String publicId, String systemId)
                throws SAXException, IOException
                {
                return new InputSource(new StringReader(""));
                }
            });
        //parse the fixed HTML into a DOM document
        Document dom=builder.parse(new InputSource(new StringReader(html)));
        XPath xpath=XPathFactory.newInstance().newXPath();
        //each profile title is a <div class='fieldTitle'>
        NodeList L=(NodeList)xpath.evaluate("//div[@class='fieldTitle']", dom, XPathConstants.NODESET);
        for(int i=0;i< L.getLength();++i)
            {
            Profile profile=new Profile();
            profile.name=L.item(i).getTextContent();
            //collect the histogram images following the title
            for(Node n1=L.item(i).getNextSibling();n1!=null;n1=n1.getNextSibling())
                {
                NodeList L2=(NodeList)xpath.evaluate(".//a[starts-with(@href,'http:temp/histo_')]",n1, XPathConstants.NODESET);
                for(int j=0;j< L2.getLength();++j)
                    {
                    String href=Element.class.cast(L2.item(j)).getAttribute("href").substring(10);
                    profile.imgs.add(new URL(BASE,"/temp/"+href));
                    }
                }
            profiles.add(profile);
            }
        return profiles;
        }

    /** read the whole input stream into a String */
    private static String toString(InputStream input) throws IOException
        {
        StringBuilder b=new StringBuilder();
        Reader in=new InputStreamReader(input);
        char array[]=new char[2048];
        int nRead=0;
        while((nRead=in.read(array))!=-1)
            {
            b.append(array, 0, nRead);
            }
        in.close();
        return b.toString();
        }

    public static void main(String[] args)
        {
        try
            {
            AmazoniaRobot app=new AmazoniaRobot();
            if(args.length==0)
                {
                System.err.println("Gene Name missing");
                return;
                }
            else if(args.length!=1)
                {
                System.err.println("Illegal number of arguments.");
                return;
                }
            String geneName=args[0];
            List<Profile> profiles=app.search(geneName);
            //build a Swing panel: one tab per expression profile
            Dimension dim=Toolkit.getDefaultToolkit().getScreenSize();
            JPanel pane=new JPanel(new BorderLayout(5,5));
            pane.setPreferredSize(new Dimension(
                (int)(dim.width*0.8),
                (int)(dim.height*0.8)
                ));
            pane.setBorder(new EmptyBorder(5, 5, 5, 5));
            pane.add(new JLabel(geneName,JLabel.LEFT),BorderLayout.NORTH);
            JTabbedPane tabbed=new JTabbedPane();
            pane.add(tabbed,BorderLayout.CENTER);
            for(Profile profile: profiles)
                {
                for(URL url:profile.imgs)
                    {
                    JPanel pane2=new JPanel(new BorderLayout(5,5));
                    pane2.add(new JLabel(profile.name,JLabel.LEFT),BorderLayout.NORTH);
                    tabbed.addTab(profile.name, pane2);
                    Image img=Toolkit.getDefaultToolkit().createImage(url);
                    pane2.add(new JScrollPane(new JLabel(new ImageIcon(img))));
                    }
                }
            JOptionPane.showMessageDialog(null, pane);
            }
        catch (Exception e)
            {
            e.printStackTrace();
            }
        }
    }
- search() submits the gene name to the site's search page
- if the server answers with an HTTP redirection, the gene has been found
- the HTML page behind the redirection is downloaded
- the HTML is fixed to make it a valid XML document (a standalone sketch of this step follows the list)
- the fixed page is parsed into a DOM document
- XPath is used to find the expression images and their labels
- the images are displayed in a Java/Swing dialog
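Since that HTML-fixing step is the most fragile part, here is a standalone sketch of it, with a made-up input string, assuming the page only needs its <meta> tags closed and its bare ampersands escaped:

public class FixHtmlDemo
    {
    //close the unterminated <meta> tags and escape bare ampersands,
    //as done in AmazoniaRobot.search(), so the page can be fed to a DOM parser
    static String fixHtml(String html)
        {
        int n=-1;
        while((n=html.indexOf("<meta",n+1))!=-1)
            {
            int n2=html.indexOf('>',n+1);
            if(n2==-1) break;
            if(html.charAt(n2-1)!='/')
                {
                html=html.substring(0,n2)+"/"+html.substring(n2);
                }
            n=n2;
            }
        return html.replace("&","&amp;");
        }

    public static void main(String[] args)
        {
        //made-up example: prints <head><meta charset="utf-8"/><title>A &amp; B</title></head>
        System.out.println(fixHtml("<head><meta charset=\"utf-8\"><title>A & B</title></head>"));
        }
    }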
Compilation
javac AmazoniaRobot.java
Execution
java AmazoniaRobot EIF4G1
That's it!
Pierre