20 December 2007

My Notebook: Apache Tomcat / Bioinformatics

Hi all,
here is how I created and installed today a web application based on JSP (JavaServer Pages) and running on Tomcat.



Prior knowledge of how to deploy a web application with Tomcat is required, so this post is more a notebook than a tutorial.

First download tomcat 6.0, extract it:
wget -q "http://apache.cict.fr/tomcat/tomcat-6/v6.0.14/bin/apache-tomcat-6.0.14.tar.gz"
tar xfz apache-tomcat-6.0.14.tar.gz


Fetch the mysql java connector, extract it, and move it into the tomcat 'lib' folder
wget -q "ftp://ftp.inria.fr/pub/MySQL/Downloads/Connector-J/mysql-connector-java-5.1.5.tar.gz"
tar xfz mysql-connector-java-5.1.5.tar.gz
mv mysql-connector-java-5.1.5/mysql-connector-java-5.1.5-bin.jar apache-tomcat-6.0.14/lib/


I need the JSP Standard Tag Library (JSTL). I fetch and extract it.
wget "http://people.apache.org/builds/jakarta-taglibs/nightly/projects/standard/jakarta-taglibs-standard-20060823.tar.gz"
tar xfz jakarta-taglibs-standard-20060823.tar.gz


I create a database of snp.
mysql -u root -p -D test -e 'create table snp(chrom varchar(10) ,chromStart int not null,chromEnd int not null,name varchar(20) unique not null)'

I fill this database with a few snp from dbsnp@ucsc
mysql -N --user=genome --host=genome-mysql.cse.ucsc.edu -A -D hg18 -e 'select chrom,chromStart,chromEnd,name from snp126 where chrom="chrM" ' |\
gawk -F ' ' '{printf("insert into test.snp(chrom,chromStart,chromEnd,name) values (\"%s\",%s,%s,\"%s\");\n",$1,$2,$3,$4);}' |\
mysql -u login -p


I add a mysql connection pool to tomcat. In apache-tomcat-6.0.14/conf/context.xml, I add the following code just before the closing tag </Context>
<Resource name="jdbc/MYSQL" auth="Container" type="javax.sql.DataSource"
maxActive="100" maxIdle="30" maxWait="10000"
username="login" password="yourpassword" driverClassName="com.mysql.jdbc.Driver"
url="jdbc:mysql://localhost:3306/test?autoReconnect=true"/>


we also need to setup a few properties before running tomcat:

# Environment read by Tomcat's startup scripts; 'export NAME=value' needs the '=' sign.
export JAVA_HOME=/usr/your-path/java1.6
export CATALINA_HOME="${PWD}/apache-tomcat-6.0.14"
export CATALINA_BASE="${PWD}/apache-tomcat-6.0.14"


we can now run tomcat.
./apache-tomcat-6.0.14/bin/startup.sh
Using CATALINA_BASE: /home/pierre/tmp/TOMCAT/apache-tomcat-6.0.14
Using CATALINA_HOME: /home/pierre/tmp/TOMCAT/apache-tomcat-6.0.14
Using CATALINA_TMPDIR: /home/pierre/tmp/TOMCAT/apache-tomcat-6.0.14/temp
Using JRE_HOME: /usr/your-path/java1.6/jre


we then create a few new directories

mkdir -p ./src/jsp
mkdir -p ./src/org/lindenb/jsp


We create a first JSP custom tag in src/org/lindenb/jsp/Anchor2DbSNP.java. This custom JSP tag will be used to create an automatic anchor to dbSNP.
package org.lindenb.jsp;
import javax.servlet.jsp.*;
import javax.servlet.jsp.tagext.*;
import java.util.regex.*;

/**
* This is a simple printing a link to dbSNP.
*/
public class Anchor2DbSNP extends BodyTagSupport
{
static private final Pattern RS_PATTERN=Pattern.compile("rs[0-9]+");

public int doEndTag() throws JspException
{
try
{
BodyContent bodyContent= getBodyContent();
if(bodyContent==null) return EVAL_PAGE;
String input=bodyContent.getString().trim().toLowerCase();
if(RS_PATTERN.matcher(input).matches())
{
getPreviousOut().print(
"<a href='http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?rs="+
input.substring(2)+
"'>"+
input+
"</a>"
);
}
else
{
getPreviousOut().print(input);
}
} catch(java.io.IOException err)
{
throw new JspException(err);
}
return EVAL_PAGE;
}
}


Another two custom tags will be used to display a simple genomic map in SVG.

Here is src/org/lindenb/jsp/ChromosomeTag.java

package org.lindenb.jsp;
import javax.servlet.jsp.*;
import javax.servlet.jsp.tagext.*;
import java.util.regex.*;
import java.util.*;

public class ChromosomeTag extends BodyTagSupport
{
private static class Position
{
int position=0;
String name=null;
public Position(int position,String name)
{
this.position=position;
this.name=name;
}
}

private Vector<Position> items= null;
private int svgWidth=500;
private int itemHeight=20;


public int doStartTag() throws JspException
{
items= new Vector<Position>();
return EVAL_BODY_INCLUDE;
}

public void addPosition(int position,String name)
{
if(position<0 || name==null) return;
this.items.addElement(new Position(position,name));
}

public int doEndTag() throws JspException
{
//if(this.items.isEmpty()) return EVAL_PAGE;
int max=0;
int min=Integer.MAX_VALUE;
for(Position p:this.items)
{
max=Math.max(p.position,max);
min=Math.min(p.position,min);
}
try
{
JspWriter out= pageContext.getOut();
out.write("<svg xmlns:xlink='http://www.w3.org/1999/xlink' xmlns='http://www.w3.org/2000/svg' width='"+svgWidth+"' height='"+ (this.items.size()*itemHeight)+"' style='font-size:"+(itemHeight-10)+"pt;stroke-width:1;'>");
out.write("<rect x='0' y='0' width='"+svgWidth+"' height='"+ (this.items.size()*itemHeight)+"' style='fill:white; stroke:gray;'/>");
int y=0;
for(Position p:this.items)
{
int x= (int)(((p.position-min)/(float)(max-min))*(svgWidth-200))+100;
out.write("<line x1='"+x+"' y1='"+y+"' x2='"+x+"' y2='"+(y+itemHeight)+"' style='stroke:blue;'/>");

out.write("<line x1='0' y1='"+y+"' x2='"+svgWidth+"' y2='"+(y)+"' style='stroke:gray;'/>");

out.write("<text x='"+(x+4)+"' y='"+(y+5+itemHeight/2)+"' >"+p.name+"</text>");
y+=itemHeight;
}

out.write("</svg>");

}
catch(java.io.IOException err)
{
throw new JspException(err);
}
items=null;
return EVAL_PAGE;
}

public void release()
{
items=null;
}
}


and

src/org/lindenb/jsp/ChromItemTag.java
package org.lindenb.jsp;
import javax.servlet.jsp.*;
import javax.servlet.jsp.tagext.*;
import java.util.regex.*;


public class ChromItemTag extends BodyTagSupport
{
private int position=-1;
private String name="";

public void setPosition(int position) { this.position= position;}


public int doEndTag() throws JspException
{
BodyContent bodyContent= getBodyContent();
if(bodyContent!=null) this.name =bodyContent.getString().trim();
if(this.name==null || name.length()==0) this.name=String.valueOf(this.position);
Tag parent= findAncestorWithClass(this,ChromosomeTag.class);
if(parent==null) return EVAL_PAGE;
ChromosomeTag ct= ChromosomeTag.class.cast(parent);
ct.addPosition(this.position+20,this.name);
return EVAL_PAGE;
}

public void release()
{
name=null;
position=-1;
}
}



the file src/bio.tld is the file used to declare the three custom tags.

<?xml version="1.0" encoding="ISO-8859-1" ?>
<!--
  Tag library descriptor for the three custom tags of this application.
  The JSP 2.0 schema declared below has no <info> element (that name belongs
  to the JSP 1.1 DTD): a tag is described with <description>, which must come
  before <name>.
-->
<taglib xmlns="http://java.sun.com/xml/ns/j2ee"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://java.sun.com/xml/ns/j2ee http://java.sun.com/xml/ns/j2ee/web-jsptaglibrary_2_0.xsd"
        version="2.0">

  <description>Bioinfo JSP TAG library</description>
  <display-name>Bioinfo</display-name>
  <tlib-version>1.1</tlib-version>
  <short-name>bio</short-name>
  <uri>http://jsp.lindenb.org</uri>

  <tag>
    <description>display a link to dbSNP</description>
    <name>rs</name>
    <tag-class>org.lindenb.jsp.Anchor2DbSNP</tag-class>
    <body-content>JSP</body-content>
  </tag>

  <tag>
    <description>svg map</description>
    <name>chrom</name>
    <tag-class>org.lindenb.jsp.ChromosomeTag</tag-class>
    <body-content>JSP</body-content>
  </tag>

  <tag>
    <description>svg item</description>
    <name>item</name>
    <tag-class>org.lindenb.jsp.ChromItemTag</tag-class>
    <body-content>JSP</body-content>
    <attribute>
      <name>position</name>
      <required>true</required>
      <rtexprvalue>true</rtexprvalue>
    </attribute>
  </tag>

</taglib>



the file src/jsp/page.jsp is our JSP. It displays an SVG map and a table of a few SNPs. It uses the JSTL and our custom tags.
<jsp:root
    xmlns:jsp="http://java.sun.com/JSP/Page"
    xmlns:c="http://java.sun.com/jsp/jstl/core"
    xmlns:sql="http://java.sun.com/jsp/jstl/sql"
    xmlns:bio="http://jsp.lindenb.org"

    version="2.0">
<jsp:directive.page contentType="text/xml; charset=iso-8859-1"/>
<jsp:output doctype-root-element="html"
    doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
    doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
    omit-xml-declaration="true"
    />
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>JSP Tutorial For Bioinformatics</title>
<!-- <meta http-equiv="Content-Type" content="application/xhtml+xml; charset=iso-8859-1" /> -->
</head>
<body>
<!-- The query is run once; its result feeds both the SVG map and the table,
     so the two views always show the same rows. -->
<sql:query var="snps" dataSource="jdbc/MYSQL">select * from snp limit 10</sql:query>

<!-- SVG map: one item per SNP, labelled with the SNP name -->
<bio:chrom>
  <c:forEach var="row" items="${snps.rows}">

    <bio:item position="${row.chromStart}"><c:out value="${row.name}"/></bio:item>
  </c:forEach>
</bio:chrom>

<!-- Table of the same SNPs, each name linked to dbSNP -->
<table>
<tr><th>Position</th><th>Name</th></tr>
<c:forEach var="row" items="${snps.rows}">
<tr>
<td><c:out value="${row.chrom}"/>:<c:out value="${row.chromStart}"/>-<c:out value="${row.chromEnd}"/></td>
<td><bio:rs><c:out value="${row.name}"/></bio:rs></td>
</tr>
</c:forEach>
</table>
</body>
</html>
</jsp:root>


Tomcat needs src/web.xml as a descriptor to learn how to deploy this web application.

<?xml version="1.0" encoding="ISO-8859-1"?>
<web-app
    xmlns="http://java.sun.com/xml/ns/javaee"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd"
    version="2.5">
  <display-name>Application Name</display-name>
  <description>Application Description</description>

  <!-- In the web-app 2.5 schema declared above, <taglib> is not a direct
       child of <web-app>: tag library mappings belong inside <jsp-config>. -->
  <jsp-config>

    <!-- our own tag library -->
    <taglib>
      <taglib-uri>http://jsp.lindenb.org</taglib-uri>
      <taglib-location>/WEB-INF/bio.tld</taglib-location>
    </taglib>

    <!-- see http://www.developer.com/java/ejb/article.php/1447551 -->
    <taglib>
      <taglib-uri>http://java.sun.com/jstl/fmt</taglib-uri>
      <taglib-location>/WEB-INF/fmt.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/fmt-rt</taglib-uri>
      <taglib-location>/WEB-INF/fmt-rt.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/core</taglib-uri>
      <taglib-location>/WEB-INF/c.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/core-rt</taglib-uri>
      <taglib-location>/WEB-INF/c-rt.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/sql</taglib-uri>
      <taglib-location>/WEB-INF/sql.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/sql-rt</taglib-uri>
      <taglib-location>/WEB-INF/sql-rt.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/x</taglib-uri>
      <taglib-location>/WEB-INF/x.tld</taglib-location>
    </taglib>

    <taglib>
      <taglib-uri>http://java.sun.com/jstl/x-rt</taglib-uri>
      <taglib-location>/WEB-INF/x-rt.tld</taglib-location>
    </taglib>

  </jsp-config>

</web-app>


and we finally need src/build.xml to build all this stuff with ant.
<?xml version="1.0" encoding="ISO-8859-1"?>
<!-- Ant build file, meant to be run from the src directory (basedir is "."):
     the default target "install" compiles the custom tags, packs them into
     bio.jar and assembles test.war in Tomcat's webapps folder. -->
<project name="Test" default="install" basedir=".">
<!-- Locations of Tomcat and of the extracted JSTL distribution, relative to this directory -->
<property name="tomcat.home" value="../apache-tomcat-6.0.14"/>
<property name="jstl.home" value="../jakarta-taglibs/standard"/>
<property name="webapps" value="${tomcat.home}/webapps"/>

<!-- Compiles org/lindenb/jsp/*.java next to the sources (srcdir and destdir are
     both "."), then packs everything found under org/ into bio.jar. -->
<target name="compile">
<javac destdir="." srcdir="." debug="on">
<include name="org/lindenb/jsp/*.java"/>
<classpath>
<pathelement location="${jstl.home}/lib/jstl.jar"/>
<pathelement location="${jstl.home}/lib/standard.jar"/>
<pathelement location="${tomcat.home}/lib/servlet-api.jar"/>
<pathelement location="${tomcat.home}/lib/jsp-api.jar"/>
</classpath>
</javac>

<jar destfile="bio.jar"
basedir="."
includes="org/**"

/>

</target>

<!-- Builds the web archive by hand with the zip task: JSP pages at the root,
     web.xml and all the .tld files under WEB-INF, the JSTL jars and bio.jar
     under WEB-INF/lib. -->
<target name="install" depends="compile">
<!-- yes, I know there is also war task... -->
<zip destfile="${webapps}/test.war">
<zipfileset dir="jsp" includes="*.jsp"/>
<zipfileset dir="." includes="web.xml" prefix="WEB-INF"/>
<zipfileset dir="${jstl.home}/tld" includes="*.tld" prefix="WEB-INF"/>
<zipfileset dir="." includes="*.tld" prefix="WEB-INF"/>
<zipfileset dir="${jstl.home}/lib" includes="*.jar" prefix="WEB-INF/lib"/>
<zipfileset dir="." includes="bio.jar" prefix="WEB-INF/lib"/>
</zip>
</target>

</project>


let's build this application. It creates a web archive (war) in the webapps folder of tomcat.
ant
Buildfile: build.xml

compile:
[jar] Building jar: /home/pierre/tmp/TOMCAT/src/bio.jar

install:
[zip] Building zip: /home/pierre/tmp/TOMCAT/apache-tomcat-6.0.14/webapps/test.war

BUILD SUCCESSFUL
Total time: 1 second


When you open "http://localhost:8080/test/page.jsp" you should get the following screen:



Pierre

17 December 2007

ShiftHappens

A great presentation about the future of education (somehow disheartening...)





see also: http://shifthappens.wikispaces.com/

13 December 2007

Embedded "ManyEyes" interactive visualization

Today ManyEyes launched the ability to embed an interactive visualization into your own blog, personal webpage or any other page you think makes sense: see http://blog.many-eyes.com/2007/12/12/embeddable-visualizations-have-arrived/.



Pierre

12 December 2007

IBD Status applet

I've just released an applet called IBDStatus. This applet (java 6 is required) is freely available at:




This applet takes as input the breakpoint analysis data (Nature. Dib et al.(1996); 380:152-154) from the 'Fondation Jean-Dausset' (CEPH) and display the Identical By Descent (IBD) regions between a pair of related individuals. Two people share an allele identical by descent if the two copies of the allele were inherited from a common ancestor. A pair of siblings can share 0, 1, or 2 alleles:
  • 0: not the same alleles

  • 1: only one allele in common

  • 2: both same alleles





Picture from Abel & Dessein


As an example, this IBD status can be used to design the controls of a CGH assay.







  • Top left pane: a linkage table with genotype=f(individual,marker)

  • Middle left pane: the list of individuals: using the Ctrl-key, select
    two related
    individuals and press the Add sib button. Your new pair is added in
    the bottom left table.

  • Bottom left pane: the list of sib-pairs: for each pair, the IBD status
    is displayed in the right table


  • Right table:

    • Marker index

    • chromosome

    • STS D-Number

    • Start-position (build 36)

    • End-position (build 36)

    • IBD status of each sib-pair(if any)

    • Count IBD with unknown status

    • Count IBD. 0


    • Count IBD. 1

    • Count IBD. 2






I wrote this software a few months ago but it was not much used, so I
was given the permission to release this version to the community.
Enjoy.

Pierre

06 December 2007

Google Chart API Launched

Today Google launched the Google Chart API, a simple URL-based tool for creating charts and graphs for websites.

For example the following url:


http://chart.apis.google.com/chart?
chco=ff0000,00ff00,0000ff /* colors */
&cht=p3 /* Chart Type= Pie */
&chd=t:1,2,3,4 /* Values */
&chs=400x200 /* Dimension */
&chl=Nature|PNAS|Science|EMBO|Virology /* LABELS */


will display this image:




Pierre

02 December 2007

PIvot

FYI: I put 'pivot' in the project hosted at http://code.google.com/ where I put some of my (open) source codes. Pivot is a java command-line tool, it digests a tabular source and prints a summary of the data. See the wiki page for more information http://code.google.com/p/lindenb/wiki/Pivot.

Pierre

01 December 2007

Women in Science 2

Maud

Maud came from Genethon where she took part in the creation of the first genetic map of the human genome. After several years at the National Center Of Genotyping, she became our engineer responsible for the production of the genotypes with the Illumina platform at Integragen (this technology is the same as the one used by 23andMe).

Pierre

Women in Science 1

As I said after Scifoo 2007, I should take some time to draw. So here it is...

Christine K.

A portrait of my former colleague Christine, a biostatistician, who was in charge of the analysis of the genotypes produced at Integragen. She now works in an Inserm unit with Dr Florence Demenais on genetic epidemiology and multifactorial diseases.


Pierre

07 November 2007

JOBS 20071106

Programmer to install and maintain genome database software:
http://www2.recruitingcenter.net/clients/CalTech/publicjobs/controller.cfm?jbaction=JobProfile&Job_Id=13989&esid=az

Bioinformatician/Biostatistician position at the Centre for Molecular Oncology, Institute of Cancer
Immediate opening for a highly motivated Bioinformatician/Biostatistician with strong numerical skills and a track record of the analysis of high throughput technologies.
http://webapps.qmul.ac.uk/hr/vacancies/jobs.php?c=0&format=full

05 November 2007

Job@Novartis 20071105

Position Title Research Investigator II/Bioinformatics
Work Location Switzerland - Basel
Company/Legal Entity Switzerland Novartis Pharma AG, Basel
Functional Area Research
Job Type Full Time
https://xjobs.brassring.com/2057/asp/tg/cim_jobdetail.asp?jobId=602151&PartnerId=13617&SiteId=5049&type=mail&JobReqLang=140&recordstart=1&JobSiteId=5049&JobSiteInfo=602151_5049&gqid=0

02 November 2007

job@swissprot 2007-11-02

The Swiss-Prot group at the Swiss Institute of Bioinformatics (SIB) in Geneva (Switzerland) is looking for a Java developer.

See http://www.isb-sib.ch/infos/careers_070625.htm for more details.

OpenSocial, a google API for social networks, is alive

OpenSocial, the new Google API is alive at : http://code.google.com/apis/opensocial/.

OpenSocial provides a common set of APIs for social applications across multiple websites. With standard JavaScript and HTML, developers can create apps that access a social network's friends and update feeds.

Common APIs mean you have less to learn to build for multiple websites. OpenSocial is currently being developed by Google in conjunction with members of the web community. The ultimate goal is for any social website to be able to implement the APIs and host 3rd party social applications. There are many websites implementing OpenSocial, including Engage.com, Friendster, hi5, Hyves, imeem, LinkedIn, MySpace, Ning, Oracle, orkut, Plaxo, Salesforce.com, Six Apart, Tianji, Viadeo, and XING.
In order for developers to get started immediately, Orkut has opened a limited sandbox that you can use to start building apps using the OpenSocial APIs.

01 November 2007

CompilationTask

The C preprocessor contains some predefined macros that can be used to identify the date when your program was compiled:

(...)
printf("%s Compiled on %s at %s.\n",argv[0],__DATE__,__TIME__);
(...)


Java has no preprocessor, and is missing this kind of information: I wrote a custom ant task generating a java file called Compilation.java and containing all the needed information.


Compilation:
<taskdef name="compileInfoTask"
classname="org.lindenb.ant.CompileInfoTask"
classpath="build/ant"/>
(...)
<compileInfoTask name="Pubmed2Wikipedia" package="org.lindenb.util" dir="build/compile"/>




Result:


package org.lindenb.util;
import java.util.GregorianCalendar;
/**
 * Constants describing one compilation of a program: its name, the user and
 * directory it was compiled in, and the moment of compilation.
 * All members are static; the class cannot be instantiated.
 */
public class Compilation
{
    /** Hidden: this class only exposes static accessors. */
    private Compilation()
    {
    }

    /** @return the name of the compiled program */
    public static String getName()
    {
        return "Pubmed2Wikipedia";
    }

    /** @return the directory the program was compiled in */
    public static String getPath()
    {
        return "~/lindenb";
    }

    /** @return the login of the user who compiled the program */
    public static String getUser()
    {
        return "pierre";
    }

    /** @return a new calendar set to the compilation time (calendar months are zero-based) */
    public static GregorianCalendar getCalendar()
    {
        return new GregorianCalendar(2007, GregorianCalendar.NOVEMBER, 1, 22, 30, 11);
    }

    /** @return the compilation time as text */
    public static String getDate()
    {
        return "2007-11-01 at 22:30:11";
    }

    /** @return a one-line summary; the name prefix is left out when the name is null */
    public static String getLabel()
    {
        StringBuilder label = new StringBuilder();
        if (getName() != null)
        {
            label.append(getName()).append(" : ");
        }
        label.append("Compiled by ").append(getUser());
        label.append(" on ").append(getDate());
        label.append(" in ").append(getPath());
        return label.toString();
    }

    /** Prints the summary label on standard output. */
    public static void main(String args[])
    {
        System.out.println(getLabel());
    }
}


The source code is available here:

http://lindenb.googlecode.com/svn/trunk/src/java/org/lindenb/ant/CompileInfoTask.java


Pierre

Ant custom tasks

Ant, the java Make, can be extended by creating your own custom tasks. I had fun today creating a new Ant task called SplashTask. It generates a new logo on the fly to be used as a java splash screen.

Declaration:


(...) <taskdef name="makeSplash"
classname="org.lindenb.ant.SplashTask"
classpath="build/ant"/>
(...)
<target name="splash" depends="compile-ant-tasks">
<makeSplash title="Hello World !" file="task.jpeg"/>
</target>
(...)


Usage:

pierre@linux:~/lindenb> ant x
Buildfile: build.xml

compile-ant-tasks:

splash:
[makeSplash] Saved SplashScreen "Hello World !" to task.jpeg[349 x 85]



task

The source code is available at http://lindenb.googlecode.com/svn/trunk/src/java/org/lindenb/ant/SplashTask.java


Pierre

Pubmed2Wikipedia

I've created a java tool called pubmed2wikipedia: I wrote it to quickly create a new entry for wikipedia.
First, the user selects a set of articles about a given subject from pubmed; the software then downloads, prepares and formats the data for a new wikipedia page. For example it creates the 'references' part and suggests the Categories: from the MeSH terms. I've also included a dictionary which recognizes some regex patterns to help create wikipedia internal links.
I first tried to use my own tool to create an entry about NSP3, a viral protein I studied during my PhD but with hundred of articles I felt I was not any more an expert about this protein :-) so I created a small article about another protein: RoXaN.

I hosted this tool on http://code.google.com. It is available at: http://lindenb.googlecode.com/files/pubmed2wikipedia.jar

Pierre

26 October 2007

“Getting Started In…”

In PLOS, today: This month, PLoS Computational Biology and the ISCB begin a series of short, practical articles for students and active researchers who want to learn more about new areas of computational biology and are unsure where or how to start. The aim of each article in the “Getting Started in…” series is to introduce the essentials: define the area and what it is about, highlight the debates and issues of relevance, and provide directions to the most relevant books, articles, or Web sites to find out more...

The first expert to inform, motivate, and inspire readers to consider a new direction is Dr. Xiaole Shirley Liu, who introduces tiling microarrays.

“Getting Started In…”: A Series Not to Miss: PLoS Computational Biology 3 (10), e224 (2007)
Getting Started in Tiling Microarray Analysis : PLoS Computational Biology 3 (10), e183 (2007)

18 October 2007

Information R/evolution

A beautiful video about the science of information...

03 October 2007

Publish or Perish

FYI: Found today but not tested:


Publish or Perish - A citation analysis software program, designed to help individual academics to present their case for research impact to its best advantage.



Publish or Perish is a software program that retrieves and analyzes academic citations. It uses Google Scholar to obtain the raw citations, then analyzes these and presents the following statistics:




  • Total number of papers

  • Total number of citations

  • Average number of citations per paper

  • Average number of citations per author

  • Average number of papers per author

  • Average number of citations per year

  • Hirsch's h-index and related parameters

  • Egghe's g-index

  • The contemporary h-index

  • The age-weighted citation rate

  • Two variations of individual h-indices

  • An analysis of the number of authors per paper.

25 September 2007

A mysql user defined function to get the reverse complement of a sequence

I was asked today to compare some experimental genotypes to the
theoretical data. Unfortunately many genotypes were given on the
anti-parallel strand compared to their references. So I wrote a small
mysql user defined function (UDF)
(see http://dev.mysql.com/doc/refman/5.0/en/adding-functions.html)
to implement a new function
called 'revcomp' in mysql, used to return the reverse complement of
a DNA sequence. Written in 'C/C++', this kind of function can be used
to embed bioinformatics into mysql. The coding was straightforward as I
had already written a UDF translating a cDNA to a protein sequence in a previous
post (http://plindenbaum.blogspot.com/2006/07/mysql-user-defined-function-udf-for.html).

Here is the code.


#include <my_global.h>
#include <m_ctype.h>
#include <mysql.h>

#include <m_string.h>
/* called by mysql to initialize the revcomp function */
my_bool revcomp_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
/* called by mysql to dispose of the revcomp function */
void revcomp_deinit(UDF_INIT *initid);
/* the main function, which returns the reverse complement of a DNA sequence */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error);

/*
 * Returns the complementary base of a nucleotide code.
 * Handles A/T/G/C and the ambiguity codes W, S, Y, R, K, M, B, D, H, V and N,
 * in upper and lower case (the case of the input is preserved).
 * Any other character yields '?'.
 */
static char complement(char b)
{
    /* parallel tables: to[i] is the complement of from[i] */
    static const char from[] = "ATGCatgcwWsSyYrRkKmMbdhvBDHVNn";
    static const char to[]   = "TACGtacgwWsSrRyYmMkKvhdbVHDBNn";
    unsigned int i;

    /* sizeof - 1 skips the terminating NUL, so '\0' is never matched */
    for (i = 0; i < sizeof(from) - 1; ++i)
    {
        if (from[i] == b)
        {
            return to[i];
        }
    }
    return '?';
}


/**
 * Called by mysql to initialize our revcomp function.
 *
 * Checks that the function is called with exactly one STRING argument and
 * prepares the per-call state.
 *
 * initid   state shared with revcomp() and revcomp_deinit(); 'ptr' will hold
 *          the buffer of the transformed sequence
 * args     description of the arguments of the call
 * message  buffer receiving the error message on failure
 * returns  0 on success, 1 on error
 */
my_bool revcomp_init(
    UDF_INIT *initid,
    UDF_ARGS *args,
    char *message
    )
{
    /* check we have one STRING argument */
    if (args->arg_count != 1 || args->arg_type[0] != STRING_RESULT)
    {
        strncpy(message, "Bad parameter, expected a DNA", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->maybe_null = 1;
    /* No buffer yet: malloc(0) may legally return NULL and would be mistaken
       for an out-of-memory condition. revcomp() sizes the buffer with
       realloc(), which behaves like malloc() when given a NULL pointer. */
    initid->ptr = NULL;
    return 0;
}

/** Called by mysql to dispose of our revcomp function: releases the result buffer. */
void revcomp_deinit(UDF_INIT *initid)
{
    /* free(NULL) is a no-op, so no explicit NULL test is required */
    free(initid->ptr);
}

/**
 * Called by mysql to reverse-complement a DNA sequence.
 *
 * initid   state of the call; 'ptr' holds the buffer that is grown as needed
 *          and returned to mysql
 * args     args->args[0] is the sequence, args->lengths[0] its length
 * result   buffer supplied by mysql; only used here to return an empty string
 * length   receives the length of the returned string
 * is_null  set to 1 when the result is NULL
 * error    one-byte flag, set to 1 on failure. It is NOT a message buffer,
 *          so nothing but that single byte may be written to it.
 * returns  the reverse complement, or NULL for a NULL argument or on
 *          memory exhaustion
 */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
              unsigned long *length, char *is_null, char *error)
{
    unsigned long i;
    /* the size of the input */
    const unsigned long size = args->lengths[0];
    /* the DNA given as input */
    const char *dna = args->args[0];
    char *ptr = NULL;

    if (dna == NULL) /* DNA is a null argument */
    {
        *is_null = 1;
        return NULL;
    }

    *is_null = 0;
    *error = 0;
    /* the length of the returned string will be 'size' */
    *length = size;

    if (size == 0)
    {
        /* realloc(p, 0) may free p and return NULL, which would both look
           like an out-of-memory condition and leave initid->ptr dangling:
           an empty input simply yields an empty output. */
        return result;
    }

    /* try to reallocate our memory to store the new transformed DNA sequence;
       no terminating NUL is needed because the length is returned separately */
    ptr = (char *)realloc(initid->ptr, sizeof(char) * size);

    /* out of memory ? the old buffer is still owned by initid->ptr */
    if (ptr == NULL)
    {
        *is_null = 1;
        *error = 1;
        return NULL;
    }
    initid->ptr = ptr;

    /* build the reverse complement */
    for (i = 0; i < size; ++i)
    {
        ptr[i] = complement(dna[(size - 1) - i]);
    }

    /* return our pointer */
    return ptr;
}



And here is the Makefile for my machine...


/usr/lib/revcomp.so:revcomp.c
gcc -fPIC -shared -I/usr/include/mysql -DDBUG_OFF -O3 -lmysqlclient -o $@ $<




The function was installed on mysql using the following statement:
mysql> create function revcomp returns string SONAME "revcomp.so";


TESTS:


mysql> select revcomp("GAATTC");
+-------------------+
| revcomp("GAATTC") |
+-------------------+
| GAATTC |
+-------------------+
1 row in set (0,00 sec)

mysql> select revcomp(revcomp("AAATTTaaatttGC"));
+------------------------------------+
| revcomp(revcomp("AAATTTaaatttGC")) |
+------------------------------------+
| AAATTTaaatttGC |
+------------------------------------+
1 row in set (0,00 sec)

mysql> select revcomp("SWatgcatgAAATTTaaatttGC");
+------------------------------------+
| revcomp("SWatgcatgAAATTTaaatttGC") |
+------------------------------------+
| GCaaatttAAATTTcatgcatWS |
+------------------------------------+
1 row in set (0,00 sec)



Here I create a table of primers and I find all the sub-sequences that could be amplified.


mysql> create table primers(id int primary key auto_increment, primer
varchar(100));
Query OK, 0 rows affected (0,02 sec)

mysql> desc primers;
+--------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+--------+--------------+------+-----+---------+----------------+
| id | int(11) | | PRI | NULL | auto_increment |
| primer | varchar(100) | YES | | NULL | |
+--------+--------------+------+-----+---------+----------------+
2 rows in set (0,00 sec)


mysql> insert into primers(primer) values
("CAAAATGAAACAGGT"),
("TAATCAAATAATGCCTGGATTTCTT"),
("TGAACATCCGTCCTCTCCCCACAAA"),
("TCATAGTCCTGAGGAAAGAGAA"),
("TCAAACAAGGAAAATGGAAAACAAATTCA"),
("TCACTGCTGGATGTGTGGGAAAAACTGCA"),
("CACTGTTGCAGTTTTTCCCACAC"),
("TCAGGACTATGAGCAAAGGAACA"),
("TACCAAGTACCTGCGCTCCAGGTACATTT"),
("AGATACCTGTTTCATTT"),
("CTTTTCAGTGGTTGATGCTCAAGATG"),
("GCCTGACGAAGTTAAAACTGATATTGA")
;

Query OK, 12 rows affected (0,00 sec)
Records: 12 Duplicates: 0 Warnings: 0


mysql> set
@seq:="CACTGCTTCTCACTGTTGCAGTTTTTCCCACACATCCAGCAGTGAAAGTCCACTGTAACTTCAGCATAAT
CTGTTGGCATGTGAATTTGTTTTCCATTTTCCTTGTTTGACTGACTGGCTATGTCTTCACTTTTTTCATTTTTTTTG
ACTCTTGAGCCATAGTTCGTAAAATTGCTCCATATCTTGTATCCCATTCTCCTTCATGTAAGTCCAAACTTCTCTTT
CCTCAGGACTATGAGCAAAGGAACAGTTTCCAACATATTGACATTTTTTCCCAGAAGCAATATGGTTGCACAGATCA
AACTGTAAAGGCATTTGTTTCTTTGTGGGGAGAGGACGGATGTTCATCCACTTCTTACGTTCAATAGACATCACTCT
CATCGCACGCCGGTCTTTGGTCCACGAATGCCTTGCTTTTGCACTACAATATTTTCTGTTTTTGTCTGGTTCAATGA
CTTGACCGTTTCTCAGACACTGGGCGCACACAAACTTTATCTTCATATTAAGAAATCCAGGCATTATTTGATTACCA
AGTACCTGCGCTCCAGGTACATTTGCTTCCAAATTCTGCCAATATCGTTTAGACTCTTGAGCAATAGCATCATGTGA
GATACCTGTTTCATTTTGCCG";
Query OK, 0 rows affected (0,00 sec)

mysql> select
concat("ID.",Forward.id) as "Forward" ,
locate(Forward.primer,@seq) as "Start",
concat("ID.",Revers.id) as "Reverse",
locate(revcomp(Revers.primer),@seq)+length(Revers.primer) as "End",
repeat('*',(locate(revcomp(Revers.primer),@seq)+length(Revers.primer)-locate(Forward.primer,@seq) )/16) as "schema"
from
primers as Forward,
primers as Revers
where
locate(Forward.primer,@seq)>0 and
locate(revcomp(Revers.primer),@seq) > locate(Forward.primer,@seq)
group by 1,3 order by 5,2;

+---------+-------+---------+------+----------------------------------------+
| Forward | Start | Reverse | End | schema |
+---------+-------+---------+------+----------------------------------------+
| ID.10 | 609 | ID.1 | 628 | * |
| ID.7 | 11 | ID.6 | 46 | ** |
| ID.7 | 11 | ID.5 | 111 | ****** |
| ID.9 | 528 | ID.1 | 628 | ****** |
| ID.8 | 227 | ID.3 | 348 | ******* |
| ID.7 | 11 | ID.4 | 239 | ************** |
| ID.8 | 227 | ID.2 | 530 | ****************** |
| ID.7 | 11 | ID.3 | 348 | ********************* |
| ID.8 | 227 | ID.1 | 628 | ************************* |
| ID.7 | 11 | ID.2 | 530 | ******************************** |
| ID.7 | 11 | ID.1 | 628 | ************************************** |
+---------+-------+---------+------+----------------------------------------+
11 rows in set (0,00 sec)


That's it.

Pierre

Idiographica

Via aziesel on del.icio.us, I found the tool I was looking for:


Idiographica: a general-purpose web application to build idiograms on-demand for human, mouse and rat

Kin T and Ono Y, Bioinformatics 2007; doi:10.1093/bioinformatics/btm455



Idiographica is a web server which serves as a general purpose idiogram rendering service, and allows users to generate high-quality idiograms with custom annotation according to their own genome-wide mapping/annotation data through an easy-to-use interface. The generated idiograms are suitable not only for visualizing summaries of genome-wide analysis but also for many types of presentation material including web pages, conference posters, oral presentations, etc. Idiographica is freely available at http://www.ncrna.org/idiographica/

24 September 2007

Sketchcast

Sketchcasting is a tool to communicate something online by recording a sketch, optionally with your voice speaking. Any sketch can then be embedded on your blog/ homepage for people to play-back, and you can also point people to your sketchcast channel here.


Here is my very first attempt to draw with this tool (please don't flame :-) ) where I tried to explain the GenomeHip technology used in my company to find the region(s) involved in a genetic disease. The approach relies on isolation of identical-by-descent regions from relative-pairs sharing the same disease.




Added Later: http://www.imaginationcubed.com/LaunchPage is far more powerful but you cannot blog your drawings.

Pierre

23 September 2007

google view:timeline

seen on Timeline and map views: here you can see the results of your query on a timeline or a map. With the timeline and map views, Google’s technology extracts key dates and locations from select search results so you can view the information in a different dimension.

See Charles Darwin's Timeline
Charles Darwin's Map
Bioinformatics conferences



Pierre

11 September 2007

NCBI Resource Locator

Via the public-semweb-lifesci mailing list:


"The NCBI Resource Locator provides stable, uniform addressing for NCBI content, making it easy to link to individual records. Some NCBI resources also provide services (like search) through these URLs."

http://view.ncbi.nlm.nih.gov/



How does it work?
Each URL has the form

http://view.ncbi.nlm.nih.gov/<noun>/<verb>/<expression>

Where:

  • <noun> is an NCBI resource (e.g., pubmed, gene, nucleotide, etc.)

  • <verb> is the action to perform (e.g., search, get,etc.). If <verb> is missing, the default verb "get" is used.

  • <expression> is data used by the action to perform the request


Some examples:


Note: I guess this kind of REST URL doesn't allow specifying the output format (XML, ASN.1, etc.).

06 September 2007

IBM CoScripter: A system for capturing, sharing, and automating tasks on the Web.

Via O'Reilly Radar:

CoScripter is a Firefox extension created by IBM. It is a system for recording, automating, and sharing processes performed in a web browser such as printing photos online, requesting a vacation hold for postal mail, or checking bank account information. Instructions for processes are recorded and stored in easy-to-read text here on the CoScripter web site, so anyone can make use of them.

02 September 2007

Google Earth Sky and Flight Simulator

Via: Transnet.

The newest version of Google Earth contains the new "Google Sky" but it also contains a hidden Flight Simulator !!!! Press Ctrl-Alt-A under Linux.

Procrastination about social Networks during my vacation

"All the nerds have an account on Facebook, what about you ?"...
Ok I got one...

"All the geeks have an account on Twitter, what about you ?..."
Ok, I got one...

etc...

My favorite social network remains LinkedIn but I'm starting to get bored with all those social networks, simply because I cannot re-create my network elsewhere by sending invitations again and again to my friends/contacts: they will definitely hate me :-). I don't remember where I read "I don't need a social network, the internet IS the network" but this could be true if anyone could host his own profile (FOAF? XML?) in a file somewhere on the internet and some software could use it.

Some unordered ideas about this:

- use a kind of FOAF more structured/simple than RDF because I don't want anyone else to add a RDF statement about me.
- create a format for individuals, groups, jobs
- search by keywords ,dc:subject, location, degrees of separation
- cache the xml files in a cache on my computer (flat files ? javadb ?)
- implement a graphical tool and/or a command line tool
- trusted relations are bidirectional: he knows my mail (SHA1), I know his mail
- link to a picture
- location can be geo:lat/geo:long and/or name of country
- describe relationships (kind of relation, since...) instead of using foaf:knows
- include metadata about how long the file should be cached
- transform into rss to track the modifications.

I did it again

Okay, in a previous post I said that "I would not spam the Nature Network with the data collected from the NCBI" using the batch invitations, but this was before it was possible to add a custom message with the single invitations. The desire to test the method was too strong and I sent more than 2700 personalized invitations ("....as you published an article titled xxxx in 2003 in Bioinformatics...") to join the bioinformatics group. The number of members jumped to more than 300 people in 3 weeks.... Corie then asked me kindly but firmly to "not do this again...". Oops.... :-)

By the way, I also created a group called History Of Science where I asked for a structured source of data about History.

Pierre

No more bees

"If the bee disappears from the surface of the earth, man would have no more than four years to live. No more bees, no more pollination ... no more men!" [A Einstein]

A recent article in the French newspaper "Le Monde" titled "The bees sick of the man" says: ...In the USA, where it is described as the “syndrome of collapse of the colonies”, about 25% of the livestock would have disappeared during the winter 2006-2007. In Europe, France, Belgium, Italy, Germany, Switzerland, Spain, Greece, Poland, the Netherlands were touched since the beginning of the years 2000. The losses can reach, locally, up to 90% of the colonies.

For pessimists, see also: We're all going to die on Nature Scintilla.

Back from Holidays...

I'm now back from three weeks of holidays: there were a thousand posts in my "google-reader" so I just pressed the infamous button "Mark all as read". Sorry if I missed something really terrific :-)

Pierre

10 August 2007

Swiss-Prot 20 Talks on Google Video

Eric Jain has published the videos of the talks that happened during the "20th Anniversary of Swiss Prot": see http://eric.jain.name/2007/08/10/swiss-prot-20-talks-on-google-video/

01 August 2007

Scifoo J-1

31 July 2007

X:Map, a Genome Browser

Tim Yates is one of the latest members to join the bioinformatics group on 'Nature Network'. Dr Yates works as a Research Programmer at the Paterson Institute for Cancer Research. His web page introduces X:MAP: an interactive, real-time scrollable, genome browser that shows the location of individual exon probes with respect to their target genes, transcripts and exons.

X:Map is a genome browser (http://xmap.picr.man.ac.uk/) which uses the google map API and the data from Ensembl. The result is really neat.

see also: AJAXification of genome browsers on NN.

19 July 2007

Scifoo 07: anxiety from a homebody

I'm arriving at SFO 22H00 on the 2nd and to SFO on the 6th 10H15.

























See you there ! :-)


Pierre

Seven deadly sins of bioinformatics

Via NodalPoint.
Keynote talk from Carole Goble at BOSC SIG from ISMB 2007 in Vienna, July 2007.

17 July 2007

Inside LSID

This post contains my notes about LSID but it has nothing to do with the current "LSID wars".

There is little or no good documentation about the life science identifiers (LSID). Did you ever try to read the specs? Ouch... I'm a biologist, not a network engineer. Fortunately the sources of the Firefox add-on for LSID were very informative. They show what happens when you enter an LSID in the browser. (Note: Roderic Page has also implemented his own Firefox extension for LSID, see http://lsid.mozdev.org/)

Say, you have a LSID identifier:



The third part of this uri (ubio.org) is called the authority. By default the plugin looks at http://ubio.org:9090/authority to find a "WSDL" file.

OK, and this is where I've got a problem: the default behavior of the firefox add-on failed in most cases I've tested.
For instance, dcc.hapmap.org:9090/authority does not work with urn:LSID:dcc.hapmap.org:Individual:JA18942:1. So I guess that there must be a way to find this authority from the LSID itself (biomoby?) but I still have not found out how.

Here I added the server http://www.ubio.org/authority in the preferences of the add-ons just because I found the URL by chance


Here is the WSDL file found at http://www.ubio.org/authority:

<?xml version="1.0"?>
<wsdl:definitions xmlns:tns="http://www.hyam.net/lsid/Authority"
targetNamespace="http://www.hyam.net/lsid/Authority"
xmlns:wsdl="http://schemas.xmlsoap.org/wsdl/"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:httpsns="http://www.omg.org/LSID/2003/AuthorityServiceHTTPBindings">

<import namespace="http://www.omg.org/LSID/2003/AuthorityServiceHTTPBindings" location="LSIDAuthorityServiceHTTPBindings.wsdl" />

<wsdl:service name="MyAuthorityHTTPService">
<wsdl:port name="MyAuthorityHTTPPort" binding="httpsns:LSIDAuthorityHTTPBinding">
<httpsns:address location="http://www.ubio.org/authority/index.php" />
</wsdl:port>
</wsdl:service>

</wsdl:definitions>




The prefix associated with the namespace http://www.omg.org/LSID/2003/AuthorityServiceHTTPBindings is then searched (There may be other bindings than HTTP: SOAP, FTP...). Here it is httpsns. The <wsdl:port> element having an attribute binding containing "httpsns:LSIDAuthorityHTTPBinding" contains a child element with an attribute "location". Here the value of "location" is "http://www.ubio.org/authority/index.php". We then ask for some information about our LSID from this URL by adding a parameter "lsid=[the-lsid]" at the end of the URL: http://www.ubio.org/authority/index.php?lsid=urn:lsid:ubio.org:namebank:11815. The result is, again, a WSDL file:

<?xml version="1.0"?>
<definitions xmlns:tns="http://www.example.org/SampleDataServices"
targetNamespace="http://www.example.org/SampleDataServices"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns="http://schemas.xmlsoap.org/wsdl/"
xmlns:http="http://schemas.xmlsoap.org/wsdl/http/"
xmlns:httpsns="http://www.omg.org/LSID/2003/DataServiceHTTPBindings"
>

<import namespace="http://www.omg.org/LSID/2003/DataServiceHTTPBindings" location="LSIDDataServiceHTTPBindings.wsdl" />

<!-- Example HTTP GET Services (urlEncoding) -->
<service name="MyDataHTTPService">
<port name="MyDataServiceHTTPPort" binding="httpsns:LSIDDataHTTPBinding">
<http:address location="http://www.ubio.org/authority/data.php" />
</port>
</service>
<service name="MyMetadataHTTPService">
<port name="MyMetadataServiceHTTPPort" binding="httpsns:LSIDMetadataHTTPBinding">
<http:address location="http://www.ubio.org/authority/metadata.php" />
</port>
</service>
</definitions>


Again, from this XML file, we obtain two URLs: http://www.ubio.org/authority/data.php and http://www.ubio.org/authority/metadata.php, which are respectively used to fetch the data and the metadata about the LSID.

http://www.ubio.org/authority/data.php?lsid=urn:lsid:ubio.org:namebank:11815 returns
http://www.ubio.org/authority/data.php?lsid=urn:lsid:ubio.org:namebank:11815 (???)

Here is the Metadata/RDF file fetched from http://www.ubio.org/authority/metadata.php?lsid=urn:lsid:ubio.org:namebank:11815
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:ubio="urn:lsid:ubio.org:predicates:"
xmlns:gla="urn:lsid:lsid.zoology.gla.ac.uk:predicates:"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">

<rdf:Description rdf:about="urn:lsid:ubio.org:namebank:11815">
<dc:identifier>urn:lsid:ubio.org:namebank:11815</dc:identifier>
<dc:creator rdf:resource="http://www.ubio.org"/>
<dc:subject>Pternistis leucoscepus (Gray, GR) 1867</dc:subject>
<ubio:taxonomicGroup>Aves</ubio:taxonomicGroup>
<ubio:recordVersion>4</ubio:recordVersion>
<ubio:canonicalName>Pternistis leucoscepus</ubio:canonicalName>
<dc:title>Pternistis leucoscepus</dc:title>
<dc:type>Scientific Name</dc:type>
<ubio:lexicalStatus>Unknown (Default)</ubio:lexicalStatus>
<gla:rank>Species</gla:rank>
<gla:vernacularName rdf:resource="urn:lsid:ubio.org:namebank:954940"/>
<gla:vernacularName rdf:resource="urn:lsid:ubio.org:namebank:954941"/>
<gla:vernacularName rdf:resource="urn:lsid:ubio.org:namebank:1564236"/>
<gla:vernacularName rdf:resource="urn:lsid:ubio.org:namebank:783787"/>
<gla:vernacularName rdf:resource="urn:lsid:ubio.org:namebank:1580313"/>
<gla:mapping rdf:resource="http://starcentral.mbl.edu/microscope/portal.php?pagetitle=classification&amp;BLCHID=12-4498"/>
<gla:mapping rdf:resource="http://www.cbif.gc.ca/pls/itisca/next?v_tsn=553857&amp;taxa=&p_format=&p_ifx=cbif&p_lang="/>
<gla:hasBasionym rdf:resource="urn:lsid:ubio.org:namebank:12292"/>
<gla:objectiveSynonym rdf:resource="urn:lsid:ubio.org:namebank:12292"/>
<gla:objectiveSynonym rdf:resource="urn:lsid:ubio.org:namebank:1762007"/>
<gla:objectiveSynonym rdf:resource="urn:lsid:ubio.org:namebank:1762032"/>
<gla:objectiveSynonym rdf:resource="urn:lsid:ubio.org:namebank:1762051"/>
<gla:objectiveSynonym rdf:resource="urn:lsid:ubio.org:namebank:3408791"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1116259"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1137821"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1173817"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1174615"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1416177"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1672192"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:2233032"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:12798879"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:1909656"/>
<ubio:hasCAVConcept rdf:resource="urn:lsid:ubio.org:classificationbank:2304281"/>
<dcterms:bibliographicCitation>Sclater, W.L., Systema Avium Æthiopicarum, p. 91</dcterms:bibliographicCitation>
</rdf:Description>
</rdf:RDF>

13 July 2007

NAR, Web Server issue July 2007



The annual "Web Server Issue" of "Nucleic Acids Research" is available at :http://nar.oxfordjournals.org/content/vol35/suppl_2/index.dtl?etoc. This issue reports on 130 web servers.

Pierre

URL +1, LSID -1

"URL +1, LSID -1" is the name of the current thread on "public-semweb-lifesci":
http://www.mail-archive.com/public-semweb-lifesci@w3.org/index.html#02766
This discussion (worth a look) is about the life science identifier (LSID) and it was started by Eric Jain:


In the latest release of UniProt (11.3), all URIs of the form:

urn:lsid:uniprot.org:{db}:{id}

have been replaced with URLs:

http://purl.uniprot.org/{db}/{id}

In general, these URLs can be resolved to a human readable web page (a few are still broken, will be fixed). Some of these web pages may (or may not) be linked to a machine-readable representation via link-rel=alternate.

As an optimization for "Semantic Web" crawlers, there is experimental support for "Accept" headers (i.e. set it to "application/rdf+xml").

Some examples:

http://purl.uniprot.org/uniprot/P12345
http://purl.uniprot.org/taxonomy/9606
http://purl.uniprot.org/pdb/1BRC

Among the protagonists we can find Roderic Page, Michel Dumontier, Mark Wilkinson, Alan Ruttenberg, Dany Ayers, etc...


Life Science Identifiers (LSIDs) are persistent, location-independent, resource identifiers for uniquely naming biologically significant resources including species names, concepts, occurrences, genes or proteins, or data objects that encode information about them. To put it simply, LSIDs are a way to identify and locate pieces of biological information on the web.

As far as I understand LSID, we all should use lsid:ncbi.nlm.nih.gov:pubmed:12507336 instead of http://www.ncbi.nlm.nih.gov/sites/entrez?Db=pubmed&Cmd=ShowDetailView&TermToSearch=12507336&ordinalpos=1&itool=EntrezSystem2.PEntrez.Pubmed.Pubmed_ResultsPanel.Pubmed_RVDocSum or http://www.ncbi.nlm.nih.gov/sites/entrez?Db=pubmed&Cmd=ShowDetailView&uid=12507336. (Note that the two latter URLs are not the same but they point to the same article). An LSID resolver can also be used to find/discover some other (RDF based) properties about your object.

In the thread, a Firefox extension resolving LSID URIs was described: I just installed it in my Firefox and it looks nice, and the code is really interesting: it shows how to create a Firefox extension which will insert a new handler for a new internet protocol named "lsidres:".


(...)
LsidModule.registerSelf = function (compMgr, location, loaderStr, type){

// http://developer.mozilla.org/xpcom/api/nsIComponentRegistrar/
compMgr = compMgr.QueryInterface(Components.interfaces.nsIComponentRegistrar);
compMgr.registerFactoryLocation(LSIDPROT_HANDLER_CID,
"Protocol handler for LSID",
"@mozilla.org/network/protocol;1?name=lsidres",
location, loaderStr, type);

}
(...)



Then, when a hyperlink in an HTML page (such as lsidres:urn:lsid:ubio.org:namebank:11815) is activated, Firefox opens a new window, calls a remote LSID resolver and displays the properties of your object.

Pierre

04 July 2007

Systems-Biology using GoogleGears: my notebook


Google gears is an open source browser extension that enables web applications to provide offline functionality. The data are stored locally in a fully-searchable relational database using the sqlite engine.


My Biological Network is a tool I created as a test to play with Google gears: it is used to build a network of protein-protein interactions. It uses Google Gears to record your entries on the local disk, so Gears needs to be installed on your computer. Programming with gears with JAVASCRIPT is really cool as you don't have to implement the storage of the data on the server side and you're using some standard SQL statements to handle the data.




Screenshots


My Biological Network


Tutorial


Open the tab Organism (fig. 4): add one or more organism. (Homo Sapiens already inserted by default)

Open the tab Protein (fig. 1): add one or more protein.

Open the tab Paper (fig. 3): add one or more article that will be used as an evidence for an interaction.

Open the tab Technology (fig. 2): add one or more technology that was used to characterize an interaction.

Open the tab Component: add one or more cellular component using Gene Ontology (GO:0005575 "cellular component" was inserted by default)

Open the tab Interaction (fig. 5):


  • Name and describe this interaction

  • Select one or more protein and/or one or more previously defined proteic complex. You Cannot describe self interactions with this tool.

  • (optional) choose one or more paper/technology/component...



Open the RDF table (fig. 6): I choose to display the content of the database using RDF. Such format can then be validated and visualized using the W3C RDF validator, or transformed using XSLT, etc.... I also used the life science identifier (LSID) as an URI for my resources.


On my computer, the database is stored in /env/islande/home/lindenb/.mozilla/firefox/<profile-id>/Google Gears for Firefox/islande/<host>/mynetwork#database. The database can be manually accessed using sqlite3:

sqlite3 mynetwork#database
SQLite version 3.4.0
Enter '.help' for instructions
sqlite> .tables
component interactionhash paper technology
interaction organism protein
sqlite> .schema organism
CREATE TABLE organism(id integer primary key ,name varchar(50) not null unique);
sqlite> select * from organism;
9606|Homo Sapiens
sqlite>


Internals


When the page is loaded, we check that Gears is installed



if (!window.google || !google.gears) {
debug("NOTE: You must install Google Gears first.")

We then create the database if it does not exist. With Firefox, the file is created in ${HOME}/.mozilla/firefox/<profile-id>/Google Gears for Firefox/<server>/mynetwork#database


connection = google.gears.factory.create("beta.database","1.0");

I create the tables just by invoking some standard SQL 'CREATE TABLE' statements. I also insert some default values (e.g. human organism)



// Build the local schema. Every statement uses "if not exists" or
// "insert or ignore", so re-running the whole sequence does not fail
// on tables or rows that are already present.

// Organisms: the id is supplied explicitly (9606 for Homo Sapiens).
connection.execute("create table if not exists organism(id integer primary key ,name varchar(50) not null unique)");
connection.execute("insert or ignore into organism(id,name) values(9606,\"Homo Sapiens\")");

// Proteins and papers. "integer" and "primary key" must be separate words;
// when they are fused into one token the primary-key constraint is not declared.
connection.execute("create table if not exists protein(id integer primary key autoincrement,name varchar(50) not null,taxId int not null,acn varchar(50) not null unique)");
connection.execute("create table if not exists paper(pmid integer primary key ,title varchar(255) not null,citation varchar(255) not null,firstAuthor varchar(50) not null)");

// Cellular components, unique on both GO accession and name, with two default rows.
connection.execute("create table if not exists component(id integer primary key autoincrement,go varchar(50) not null unique, name varchar(50) not null unique)");

connection.execute("insert or ignore into component(go,name) values(\"GO:0005575\",\"cellular component\")");
connection.execute("insert or ignore into component(go,name) values(\"GO:0008372\",\"cellular component unknown\")");

// Technologies, unique on name, with two default rows.
connection.execute("create table if not exists technology(id integer primary key autoincrement,name varchar(50) not null unique, description varchar(255) not null)");

connection.execute("insert or ignore into technology(name,description) values(\"Y2H\",\"Yeast Two Hybrid System\")");
connection.execute("insert or ignore into technology(name,description) values(\"CoIP\",\"Co-Immuno Precipitation\")");


// Interactions and their link table.
// NOTE(review): interactionhash presumably ties an interaction (LINK_interaction)
// to a member row identified by (type, child) -- confirm against the insert code.
connection.execute("create table if not exists interaction(id integer primary key autoincrement, name varchar(50) not null unique,description varchar(255) not null)");
connection.execute("create table if not exists interactionhash(id integer primary key autoincrement,LINK_interaction int ,type varchar(20) not null,child int not null)");

When a record is about to be inserted, we check all the fields and then insert them using SQL INSERT INTO:


var id= getById("organism-input-id");
if(!isInteger(id.value))
{
debug("TaxId not a Number");
return;
}
var name=getById("organism-input-name");
if(trim(name.value).length==0)
{
debug("Taxon Name empty");
return;
}

try
{
connection.execute("insert into organism(id,name) values("+sqlescape(trim(id.value))+","+sqlquote(trim(name.value))+")");
id.value="";
name.value="";
}
catch(err)
{
debug(err.message);
return;
}

A simple SELECT is used to retrieve the data and insert them into an HTML table



var rs= connection.execute("select id,name from organism order by name");
while (rs.isValidRow())
{
var tr= ce("tr");
table.appendChild(tr);
var td= ce("td");
tr.appendChild(td);
td.appendChild(ct(rs.field(0)));

td= ce("td");
tr.appendChild(td);
var a= ce("a");
a.setAttribute("title","Open in NCBI");
a.setAttribute("target","tax"+rs.field(0));
a.setAttribute("href","http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id="+rs.field(0));
td.appendChild(a);
a.appendChild(ct(rs.field(1)));
rs.next();
}
rs.close();



That's it !

Pierre

updated 2010-08-12: source code

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<script type="text/javascript" src="gears_init.js"></script>
<script type="text/javascript" src="network.js"></script>
<link rel="stylesheet" type="text/css" href="./network.css" />
<title>My Biological Network</title>
</head>
<body onload="init()">
<h1>My Biological Network</h1>
<p>Pierre Lindenbaum PhD <a href="mailto:plindenbaum@yahoo.fr">plindenbaum@yahoo.fr</a><br/><a href="http://plindenbaum.blogspot.com">http://plindenbaum.blogspot.com</a><br/><address>Bioinformatics department<br/><a href="http://www.integragen.com">Integragen S.A.</a><br/>Evry, France</address></p>
<p/>
<div>
<button onclick="javascript:showCard('home-pane');">Home</button>
<button onclick="showOrganismPane()">Organisms</button>
<button onclick="showProteinPane()">Proteins</button>
<button onclick="showPaperPane()">Papers</button>
<button onclick="showTechnologyPane()">Technology</button>
<button onclick="showComponentPane()">Component</button>
<button onclick="showInteractionPane()">Interactions</button>
<button onclick="showRDFPane()">RDF</button>
</div>
<div style="color:red;" id="stderr"></div>
<p/>

<!-- ====================================== ORGANISM ====================================== -->
<div style="display:none;" id="organism-pane">
<table>
<caption>Add an Organism</caption>
<tr><th>NCBI Taxon ID <i>(e.g. 10912)</i></th><td><input id="organism-input-id" length="10"/></td></tr>
<tr><th>NCBI Taxon Name <i>(e.g. Rotavirus)</i></th><td><input id="organism-input-name" length="10"/></td></tr>
<tr><th/><td><button onclick="addOrganism()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Organisms</caption>
<thead>
<tr><th>Taxon ID</th><th>Taxon Name</th></tr></tr>
</thead>
<tbody id="organism-table">
</tbody>
</table>

</div>

<!-- ====================================== COMPONENT ====================================== -->
<div style="display:none;" id="component-pane">
<table>
<caption>Add a Component</caption>
<tr><th>Name</th><td><input id="component-input-name" length="10"/></td></tr>
<tr><th>GO</th><td><input id="component-input-go" length="10"/></td></tr>
<tr><th/><td><button onclick="addComponent()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Components</caption>
<thead>
<tr><th>Name</th><th>GO</th></tr>
</thead>
<tbody id="component-table">
</tbody>
</table>

</div>

<!-- ====================================== TECHNOLOGY ====================================== -->
<div style="display:none;" id="technology-pane">
<table>
<caption>Add a Technology</caption>
<tr><th>Name</th><td><input id="technology-input-name" length="50"/></td></tr>
<tr><th>Description</th><td><input id="technology-input-desc" length="50"/></td></tr>
<tr><th/><td><button onclick="addTechnology()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Technologies</caption>
<thead>
<tr><th>Name</th><th>Description</th></tr></tr>
</thead>
<tbody id="technology-table">
</tbody>
</table>

</div>


<!-- ====================================== PROTEIN ====================================== -->

<div style="display:none;" id="protein-pane">
<table>
<caption>Add a Protein</caption>
<tr><th>Uniprot accession number <i>(e.g. Q3T8J2)</i></th><td><input id="protein-input-acn" length="10"/></td></tr>
<tr><th>Uniprot Name <i>(e.g. Replicase polyprotein 1ab)</i></th><td><input id="protein-input-name" length="10"/></td></tr>
<tr><th>Organism</th><td><select id="protein-input-taxon" length="10"><option>A</option></select></td></tr>
<tr><th/><td><button onclick="addProtein()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Proteins</caption>
<thead>
<tr><th>Primary accession</th><th>Name</th><th>Taxon</th></tr></tr>
</thead>
<tbody id="protein-table">
</tbody>
</table>

</div>

<!-- ====================================== PAPER ====================================== -->
<div style="display:none;" id="paper-pane">
<table>
<caption>Add a Paper</caption>
<tr><th>PMID</th><td><input id="paper-input-pmid" length="10"/></td></tr>
<tr><th>Title</th><td><input id="paper-input-title" length="50"/></td></tr>
<tr><th>Citation</th><td><input id="paper-input-citation" length="50"/></td></tr>
<tr><th>First Author</th><td><input id="paper-input-author" length="50"/></td></tr>
<tr><th/><td><button onclick="addPaper()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Papers</caption>
<thead>
<tr><th>PMID</th><th>Citation</th><th>First Author</th><th>Title</th></tr></tr>
</thead>
<tbody id="paper-table">
</tbody>
</table>

</div>

<!-- ====================================== INTERACTION ====================================== -->


<div style="display:none;" id="interaction-pane">

<table>
<caption>Add an Interaction</caption>
<tr><th>Name</th><td colspan="4"><input id="interaction-input-name" length="50"/></td></tr>
<tr><th>Description</th><td colspan="4"><input id="interaction-input-desc" length="50"/></td></tr>
<tr>
<th>Protein</th>
<th>Interactors</th>
<th>Methods</th>
<th>Evidences</th>
<th>Components</th></tr>
<tr>
<td><select id="interactors-input-proteins" size="5" multiple="true"/></td>
<td><select id="interactors-input-interactors" size="5" multiple="true"></td>
<td><select id="interactors-input-technologies" size="5" multiple="true"></td>
<td><select id="interactors-input-evidences" size="5" multiple="true"></td>
<td><select id="interactors-input-components" size="5" multiple="true"></td>
</tr>
<tr><th colspan="4"/><td><button onclick="addInteraction()">Add</button></td></tr>
</table>

<hr/>

<table width="80%">
<caption>All Interactions</caption>
<thead>
<tr><th>Name</th><th>Description</th></tr></tr>
</thead>
<tbody id="interaction-table">
</tbody>
</table>

</div>

<!-- ====================================== RDF ====================================== -->
<div style="display:none;" id="rdf-pane">
<h2>RDF Pane</h2>
<textarea wrap="off" id="rdf-area" rows="20" cols="80"></textarea>

</div>

<!-- ====================================== HOME ====================================== -->
<div style="display:none;" id="home-pane">
<h3>About My Biological Network</h3>
<p><a href="http://gears.google.com/">Google gears</a> is an open source browser extension that enables web applications to provide offline functionality. The data are stored locally in a fully-searchable relational database using the <a href="http://www.sqlite.org/">sqlite engine</a>.</p>
<p><b>My Biological Network</b> is a tool I created as a test to play with Google gears: it is used to build a network of protein-protein interactions. It uses Google Gears to record your entries on the <u>local disk</u>, so Gears needs to be installed on your computer. </p>

<p>
Open the tab <b>Organism</b>: add one or more organisms. (Homo Sapiens is already inserted by default)<br/>
Open the tab <b>Protein</b>: add one or more proteins.<br/>
Open the tab <b>Paper</b>: add one or more articles that will be used as evidence for an interaction.<br/>
Open the tab <b>Technology</b>: add one or more technologies that were used to characterize an interaction.<br/>
Open the tab <b>Component</b>: add one or more cellular components using Gene Ontology (GO:0005575 "cellular component" was inserted by default)<br/>
Open the tab <b>Interaction</b>:<ul>
<li>Name and describe this interaction</li>
<li>Select one or more proteins and/or one or more previously defined protein complexes. You <i>cannot</i> describe self-interactions with this tool.</li>
<li>(optional) choose one or more paper/technology/component...</li>
</ul><br/>
Open the <b>RDF table</b>: I chose to display the content of the database using <a href="http://www.w3.org/RDF/">RDF</a>. Such a format can then be validated and visualized using the <a href="http://www.w3.org/RDF/Validator/">W3C RDF validator</a>, or transformed using <a href="http://www.w3.org/TR/xslt">XSLT</a>, etc. I also used the <a href="http://lsid.sourceforge.net/">life science identifier (LSID)</a> as a URI for my resources.<br/>

</p>

<p>On my computer, the database is stored in <code>$HOME/.mozilla/firefox/&lt;profile-id&gt;/Google Gears for Firefox/&lt;host&gt;/mynetwork#database</code>. The database can be manually accessed using <a href="http://www.sqlite.org/">sqlite3</a>:<pre style='color:black;border:1pt solid;background:lightgray;'>sqlite3 mynetwork#database
SQLite version 3.4.0
Enter &apos;.help&apos; for instructions
sqlite&gt; .tables
component interactionhash paper technology
interaction organism protein
sqlite&gt; .schema organism
CREATE TABLE organism(id integer primary key ,name varchar(50) not null unique);
sqlite&gt; select * from organism;
9606|Homo Sapiens
sqlite&gt;</pre>

</p>

</div>


<!-- google analytics -->

<script type="text/javascript" src="http://www.google-analytics.com/urchin.js"></script>
<script type="text/javascript">
  // "XXXXXX" is a placeholder account id. urchinTracker() is not defined in this
  // snippet; presumably it is provided by the urchin.js script loaded just above.
  _uacct = "XXXXXX";
  urchinTracker();
</script>

<!-- google analytics -->


</body>
</html>