10 July 2006

Mysql user defined function (UDF) for Bioinformatics.

MySQL lets you create User Defined Functions (UDFs). Written in C/C++, functions of this kind can be used to embed bioinformatics directly into MySQL. Here is an example of a function that translates a DNA sequence into a protein inside MySQL.

#include <my_global.h>
#include <m_ctype.h>
#include <mysql.h>
#include <m_string.h>

/* File-local helper: translates one codon (3 bases) into a single
   amino-acid character. */
static char translation(char a,char b,char c);

/* The initialization function: validates the argument list and prepares
   the per-call buffer kept in initid->ptr. Returns non-zero on failure
   after writing a text into message. */
my_bool translate_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
/* The deinitialization function: releases the buffer kept in initid->ptr. */
void translate_deinit(UDF_INIT *initid);
/* The main function. This is where the function result is computed:
   one output character per complete codon of the input DNA. The result
   length is reported through *length; *is_null and *error are flags. */
char *translate(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error);

/*
 * The initialization function.
 *
 * Accepts exactly one argument of type STRING_RESULT (the DNA sequence).
 * Marks the result as possibly NULL and resets the result buffer.
 *
 * initid  : per-call state; initid->ptr holds the result buffer.
 * args    : description of the SQL arguments.
 * message : buffer of MYSQL_ERRMSG_SIZE bytes receiving the error text.
 *
 * Returns 0 on success, 1 if the arguments are not acceptable.
 */
my_bool translate_init(
UDF_INIT *initid,
UDF_ARGS *args,
char *message
)
{
    /* check the args */
    if (!(args->arg_count == 1 && args->arg_type[0] == STRING_RESULT))
    {
        /* the literal is shorter than MYSQL_ERRMSG_SIZE, so strncpy
           zero-pads and the message is always NUL-terminated */
        strncpy(message,"Bad parameter expected a DNA",MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->maybe_null=1;
    /* No buffer yet. malloc(0) is allowed to return NULL, which would be
       misreported as an out-of-memory error; starting from NULL is safe
       because both realloc(NULL, n) and free(NULL) are well defined. */
    initid->ptr=NULL;
    return 0;
}

/*
 * The deinitialization function.
 * Gives back the result buffer stored in initid->ptr, if there is one.
 */
void translate_deinit(UDF_INIT *initid)
{
    char *buffer = initid->ptr;

    /* nothing was allocated: nothing to release */
    if (buffer == NULL)
    {
        return;
    }
    free(buffer);
}

/*
 * The main function. This is where the function result is computed.
 *
 * Translates the DNA string given as first argument, codon by codon,
 * into a protein string. Trailing bases that do not form a complete
 * codon are ignored.
 *
 * initid  : per-call state; initid->ptr is the (re)allocated result buffer,
 *           released by translate_deinit.
 * args    : args->args[0] is the DNA (may be NULL), args->lengths[0] its length.
 * result  : pre-allocated buffer supplied by the caller; not used here.
 * length  : receives the number of amino acids written (dnaLength/3).
 *           The result is NOT NUL-terminated.
 * is_null : set to 1 when the result is SQL NULL.
 * error   : one-byte flag, set to 1 on failure. It is not a message buffer.
 *
 * Returns initid->ptr, or NULL when the argument is NULL or on allocation
 * failure.
 */
char *translate(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error)
{
    const char *dna=args->args[0];
    unsigned long dnaLength;
    unsigned long nCodons;
    unsigned long i;
    char *ptr=NULL;

    if (dna==NULL) /* Null argument */
    {
        *is_null=1;
        return NULL;
    }
    dnaLength=args->lengths[0];
    nCodons=dnaLength/3;
    *length=nCodons;

    /* Always request at least one byte: realloc(p, 0) may free p and
       return NULL, which would look like an allocation failure and leave
       initid->ptr dangling. */
    ptr=(char*)realloc(initid->ptr, nCodons>0 ? nCodons : 1);
    if(ptr==NULL)
    {
        /* initid->ptr is still valid here and is released by
           translate_deinit. 'error' points to a single byte, so only the
           flag is set; copying a message into it would overflow. */
        *is_null=1;
        *error=1;
        return NULL;
    }
    initid->ptr=ptr;

    /* loop over the complete codons of the sequence */
    for(i=0;i<nCodons;i++)
    {
        const char *codon=dna+3*i;
        ptr[i]=translation(codon[0],codon[1],codon[2]);
    }

    return initid->ptr;
}

/************************************
*
* translation
* a function translating 3 bases (one codon) into an amino acid.
*
* base1, base2, base3: the three bases of the codon, in reading order.
* Returns a single character for the amino acid.
*
* NOTE(review): the body is elided in this listing; presumably it is a
* lookup in the genetic code table. How unknown or lowercase bases are
* handled cannot be told from here.
*/
static
char translation(char base1,char base2,char base3)
{
(...)/* implementation not shown in the original listing */
}


And here is the Makefile for my machine...

# Builds translate.c into the shared object /usr/lib/translate.so.
# -fPIC -shared produce a shared library; -I adds the MySQL header directory;
# $@ is the target and $< the first prerequisite (translate.c).
# NOTE(review): make requires the recipe line to start with a TAB; the leading
# whitespace appears to have been lost in this listing - restore it before use.
/usr/lib/translate.so:translate.c
gcc -fPIC -shared -I/usr/include/mysql -DDBUG_OFF -O3 -lmysqlclient -o $@ $<


... and the fragment from a session

mysql> CREATE FUNCTION translate RETURNS STRING SONAME 'translate.so';
Query OK, 0 rows affected (0,03 sec)

mysql> select translate("ATGGAGTCTACTCAGCAGATGGCTTCTTCTATTATTAATTCTTCATTTGAAGCT
AATTGATGGGTATTCAATATGACTACAATGAGGTATATACTAGAGTAAAGAGTAAATTTGATTTAGTTATGGATGATTC
GCAATTACTATTGATCAAGCTTTGAATGGAAAATTTAGTTCAGCGATTAGGAATAGAAATTGGATGACTGACTCTCGAA
TAAACTAAGAATTATGCTATCATCAAAAGGAATCGATCAGAAAATGAGAGTGCTTAATGCTTGTTTTAGTGTCAAGAGA
AATGTACTAGACTGATGAAAGACAAATTAGAACGTGGTGAAGTTGAAGTTGATGATTCCTTTGTTGAAGAGAAAATGGA
TATGAACAGTTAGAAAAGAGATTTGAGTCACTGAAACATCGGGTTAATGAGAAGTATAATCATTGGGTTCTTAAAGCTA
TCAAAATGTGATTT
") as NSP3;
+------------------------------------------------------------------------------ -------------------------------------------------------------------------------
| NSP3
+------------------------------------------------------------------------------
-------------------------------------------------------------------------------
| MESTQQMASSIINSSFEAAVVAATSTLELMGIQYDYNEVYTRVKSKFDLVMDDSGVKNNLIGKAITIDQALNGKFSS
SSKGIDQKMRVLNACFSVKRIPGKSSSIVKCTRLMKDKLERGEVEVDDSFVEEKMEVDTIDTKSRYEQLEKRFESLKHR

+------------------------------------------------------------------------------
-------------------------------------------------------------------------------
1 row in set (0,00 sec)


2 comments:

Anonymous said...

Here's yet another example of how c/c++ is a dangerous language, particularly if this function were exposed on a public web server:

*length=dnaLength/3;
ptr= (char*)realloc(initid->ptr,sizeof(char)*(*length));
snip...
initid->ptr=ptr;
/* loop over the codons of the sequence */
int j=0;
for(i=0;i+2< dnaLength;i+=3)
{
initid->ptr[j++]=translation(dna[i],dna[i+1],dna[i+2]);
}

Whoops! If the length of the input DNA sequence were not a multiple of 3, this code would write beyond the allocated space on the heap and could be used in a buffer overflow attack, potentially enabling the attacker to execute arbitrary code on the web server.

This post is not meant to criticize the author, but rather, to underscore the constant danger in the casual use of c/c++.

Pierre Lindenbaum said...

Hi, thank you for having a look at this code :-)
Am I wrong? I still don't see the problem (I may be wrong; it is the morning and I need a strong coffee). Say the length of your DNA is 4; then *length = 4/3 = 1: only one char is allocated (there is no need to allocate one extra char for the '\0', because the result length is returned through *length).
+ first iteration: i=0, j=0, we write at ptr[0]
+ second iteration: i=3, j=1, but i+2 >= dnaLength (3+2 >= 4), so the loop ends without writing

If dnaLength == 5:
*length = 5/3 = 1
+ first iteration: i=0, j=0, we write at ptr[0]
+ second iteration: i=3, j=1, but i+2 >= dnaLength (3+2 >= 5), so the loop ends without writing

no ?
where is my coffee ?