A mysql user defined function to get the reverse complement of a sequence
I was asked today to compare some experimental genotypes to the
theoretical data. Unfortunaly many genotypes were given on the
anti-parallele strand compared to their references. So I wrote a small
href="http://dev.mysql.com/doc/refman/5.0/en/adding-functions.html">mysql
user defined function (UDF) to implement a new function
called 'revcomp' in mysql used to return the reverse complement of
a DNA sequence. Written in 'C/C++', this kind of function can be used
to embed bioinformatics into mysql. The coding was straighforward as I
already wrote a UDF translating a cDNA to a proteic sequence in a href="http://plindenbaum.blogspot.com/2006/07/mysql-user-defined-function-udf-for.html">previous
post.
Here is the code.
#include <my_global.h>
#include <m_ctype.h>
#include <mysql.h>
#include <m_string.h>
/* this function is called by mysql to initialize it */
my_bool revcomp_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
/* this function is called by mysql to dispose it */
void revcomp_deinit(UDF_INIT *initid);
/* the main function with get the reverse complement of a dna */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error);
/* a trivial function returning the complementary base of an acid nucleic */
static char complement(char b)
{
switch(b)
{
case 'A': return 'T';
case 'T': return 'A';
case 'G': return 'C';
case 'C': return 'G';
case 'a': return 't';
case 't': return 'a';
case 'g': return 'c';
case 'c': return 'g';
case 'w': return 'w';
case 'W': return 'W';
case 's': return 's';
case 'S': return 'S';
case 'y': return 'r';
case 'Y': return 'R';
case 'r': return 'y';
case 'R': return 'Y';
case 'k': return 'm';
case 'K': return 'M';
case 'm': return 'k';
case 'M': return 'K';
case 'b': return 'v';
case 'd': return 'h';
case 'h': return 'd';
case 'v': return 'b';
case 'B': return 'V';
case 'D': return 'H';
case 'H': return 'D';
case 'V': return 'B';
case 'N': return 'N';
case 'n': return 'n';
}
return '?';
}
/** this function is called by mysql to initialize our revcomp function */
my_bool revcomp_init(
UDF_INIT *initid,
UDF_ARGS *args,
char *message
)
{
/* check we have one STRING argument */
if (!(args->arg_count == 1 && args->arg_type[0] == STRING_RESULT ))
{
strncpy(message,"Bad parameter, expected a DNA",MYSQL_ERRMSG_SIZE);
return 1;
}
initid->maybe_null=1;
/* initid->ptr will be used to store the transformed sequence */
initid->ptr= (char*)malloc(0);
/* out of memory ? */
if(initid->ptr==NULL)
{
strncpy(message,"Out Of Memory",MYSQL_ERRMSG_SIZE);
return 1;
}
return 0;
}
/** this function is called by mysql to dispose our revcomp function */
void revcomp_deinit(UDF_INIT *initid)
{
/* free the user ptr */
if(initid->ptr!=NULL) free(initid->ptr);
}
/** this is the function called by mysql to reverse-complement a DNA */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error)
{
long i;
/* the size of the input */
long size= args->lengths[0];
/* the DNA given as input */
const char *dna=args->args[0];
char *ptr=NULL;
if (dna==NULL) // DNA is a null argument
{
*is_null=1;
return NULL;
}
/* the length of the returned string will be 'size' */
*length=size;
/** try to reallocate our memory to store the new transformed DNA sequence */
ptr= (char*)realloc(initid->ptr,sizeof(char)*(size));//no need (size+1)
/* out of memory ? */
if(ptr==NULL)
{
*is_null=1;
*error=1;
strncpy(error,"Out Of Memory",MYSQL_ERRMSG_SIZE);
return NULL;
}
initid->ptr=ptr;
*is_null=0;
*error=0;
/* build the reverse complement */
for(i=0;i< size;++i)
{
initid->ptr[i] = complement( dna[(size-1)-i] );
}
/* return our pointer */
return initid->ptr;
}
And here is the Makefile for my machine...
/usr/lib/revcomp.so:revcomp.c
gcc -fPIC -shared -I/usr/include/mysql -DDBUG_OFF -O3 -lmysqlclient -o $@ $<
The function was installed on mysql using the following statement:
>mysql create function revcomp returns string SONAME "revcomp.so";
TESTS:
mysql> select revcomp("GAATTC");
+-------------------+
| revcomp("GAATTC") |
+-------------------+
| GAATTC |
+-------------------+
1 row in set (0,00 sec)
mysql> select revcomp(revcomp("AAATTTaaatttGC"));
+------------------------------------+
| revcomp(revcomp("AAATTTaaatttGC")) |
+------------------------------------+
| AAATTTaaatttGC |
+------------------------------------+
1 row in set (0,00 sec)
mysql> select revcomp("SWatgcatgAAATTTaaatttGC");
+------------------------------------+
| revcomp("SWatgcatgAAATTTaaatttGC") |
+------------------------------------+
| GCaaatttAAATTTcatgcatWS |
+------------------------------------+
1 row in set (0,00 sec)
here I create a table of primers and I find the all sub-sequences that could be amplified.
mysql> create table primers(id int primary key auto_increment, primer
varchar(100));
Query OK, 0 rows affected (0,02 sec)
mysql> desc primers;
+--------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+--------+--------------+------+-----+---------+----------------+
| id | int(11) | | PRI | NULL | auto_increment |
| primer | varchar(100) | YES | | NULL | |
+--------+--------------+------+-----+---------+----------------+
2 rows in set (0,00 sec)
mysql> insert into primers(primer) values
("CAAAATGAAACAGGT"),
("TAATCAAATAATGCCTGGATTTCTT"),
("TGAACATCCGTCCTCTCCCCACAAA"),
("TCATAGTCCTGAGGAAAGAGAA"),
("TCAAACAAGGAAAATGGAAAACAAATTCA"),
("TCACTGCTGGATGTGTGGGAAAAACTGCA"),
("CACTGTTGCAGTTTTTCCCACAC"),
("TCAGGACTATGAGCAAAGGAACA"),
("TACCAAGTACCTGCGCTCCAGGTACATTT"),
("AGATACCTGTTTCATTT"),
("CTTTTCAGTGGTTGATGCTCAAGATG"),
("GCCTGACGAAGTTAAAACTGATATTGA")
;
Query OK, 12 rows affected (0,00 sec)
Records: 12 Duplicates: 0 Warnings: 0
mysql> set
@seq:="CACTGCTTCTCACTGTTGCAGTTTTTCCCACACATCCAGCAGTGAAAGTCCACTGTAACTTCAGCATAAT
CTGTTGGCATGTGAATTTGTTTTCCATTTTCCTTGTTTGACTGACTGGCTATGTCTTCACTTTTTTCATTTTTTTTG
ACTCTTGAGCCATAGTTCGTAAAATTGCTCCATATCTTGTATCCCATTCTCCTTCATGTAAGTCCAAACTTCTCTTT
CCTCAGGACTATGAGCAAAGGAACAGTTTCCAACATATTGACATTTTTTCCCAGAAGCAATATGGTTGCACAGATCA
AACTGTAAAGGCATTTGTTTCTTTGTGGGGAGAGGACGGATGTTCATCCACTTCTTACGTTCAATAGACATCACTCT
CATCGCACGCCGGTCTTTGGTCCACGAATGCCTTGCTTTTGCACTACAATATTTTCTGTTTTTGTCTGGTTCAATGA
CTTGACCGTTTCTCAGACACTGGGCGCACACAAACTTTATCTTCATATTAAGAAATCCAGGCATTATTTGATTACCA
AGTACCTGCGCTCCAGGTACATTTGCTTCCAAATTCTGCCAATATCGTTTAGACTCTTGAGCAATAGCATCATGTGA
GATACCTGTTTCATTTTGCCG";
Query OK, 0 rows affected (0,00 sec)
mysql> select
concat("ID.",Forward.id) as "Forward" ,
locate(Forward.primer,@seq) as "Start",
concat("ID.",Revers.id) as "Reverse",
locate(revcomp(Revers.primer),@seq)+length(Revers.primer) as "End",
repeat('*',(locate(revcomp(Revers.primer),@seq)+length(Revers.primer)-locate(Forward.primer,@seq) )/16) as "schema"
from
primers as Forward,
primers as Revers
where
locate(Forward.primer,@seq)>0 and
locate(revcomp(Revers.primer),@seq) > locate(Forward.primer,@seq)
group by 1,3 order by 5,2;
+---------+-------+---------+------+----------------------------------------+
| Forward | Start | Reverse | End | schema |
+---------+-------+---------+------+----------------------------------------+
| ID.10 | 609 | ID.1 | 628 | * |
| ID.7 | 11 | ID.6 | 46 | ** |
| ID.7 | 11 | ID.5 | 111 | ****** |
| ID.9 | 528 | ID.1 | 628 | ****** |
| ID.8 | 227 | ID.3 | 348 | ******* |
| ID.7 | 11 | ID.4 | 239 | ************** |
| ID.8 | 227 | ID.2 | 530 | ****************** |
| ID.7 | 11 | ID.3 | 348 | ********************* |
| ID.8 | 227 | ID.1 | 628 | ************************* |
| ID.7 | 11 | ID.2 | 530 | ******************************** |
| ID.7 | 11 | ID.1 | 628 | ************************************** |
+---------+-------+---------+------+----------------------------------------+
11 rows in set (0,00 sec)
That's it.
Pierre