25 September 2007

A mysql user defined function to get the reverse complement of a sequence

I was asked today to compare some experimental genotypes to the
theoretical data. Unfortunaly many genotypes were given on the
anti-parallele strand compared to their references. So I wrote a small
href="http://dev.mysql.com/doc/refman/5.0/en/adding-functions.html">mysql
user defined function (UDF)
to implement a new function
called 'revcomp' in mysql used to return the reverse complement of
a DNA sequence. Written in 'C/C++', this kind of function can be used
to embed bioinformatics into mysql. The coding was straighforward as I
already wrote a UDF translating a cDNA to a proteic sequence in a href="http://plindenbaum.blogspot.com/2006/07/mysql-user-defined-function-udf-for.html">previous
post
.

Here is the code.


#include <my_global.h>
#include <m_ctype.h>
#include <mysql.h>

#include <m_string.h>
/* this function is called by mysql to initialize it */
my_bool revcomp_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
/* this function is called by mysql to dispose it */
void revcomp_deinit(UDF_INIT *initid);
/* the main function with get the reverse complement of a dna */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error);

/* a trivial function returning the complementary base of an acid nucleic */
static char complement(char b)
{
switch(b)
{
case 'A': return 'T';
case 'T': return 'A';
case 'G': return 'C';
case 'C': return 'G';

case 'a': return 't';
case 't': return 'a';
case 'g': return 'c';
case 'c': return 'g';

case 'w': return 'w';
case 'W': return 'W';

case 's': return 's';
case 'S': return 'S';

case 'y': return 'r';
case 'Y': return 'R';

case 'r': return 'y';
case 'R': return 'Y';

case 'k': return 'm';
case 'K': return 'M';

case 'm': return 'k';
case 'M': return 'K';

case 'b': return 'v';
case 'd': return 'h';
case 'h': return 'd';
case 'v': return 'b';


case 'B': return 'V';
case 'D': return 'H';
case 'H': return 'D';
case 'V': return 'B';

case 'N': return 'N';
case 'n': return 'n';

}
return '?';
}


/** this function is called by mysql to initialize our revcomp function */
my_bool revcomp_init(
UDF_INIT *initid,
UDF_ARGS *args,
char *message
)
{
/* check we have one STRING argument */
if (!(args->arg_count == 1 && args->arg_type[0] == STRING_RESULT ))
{
strncpy(message,"Bad parameter, expected a DNA",MYSQL_ERRMSG_SIZE);
return 1;
}
initid->maybe_null=1;
/* initid->ptr will be used to store the transformed sequence */
initid->ptr= (char*)malloc(0);
/* out of memory ? */
if(initid->ptr==NULL)
{
strncpy(message,"Out Of Memory",MYSQL_ERRMSG_SIZE);
return 1;
}
return 0;
}

/** this function is called by mysql to dispose our revcomp function */
void revcomp_deinit(UDF_INIT *initid)
{
/* free the user ptr */
if(initid->ptr!=NULL) free(initid->ptr);
}

/** this is the function called by mysql to reverse-complement a DNA */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
unsigned long *length, char *is_null, char *error)
{
long i;
/* the size of the input */
long size= args->lengths[0];
/* the DNA given as input */
const char *dna=args->args[0];
char *ptr=NULL;

if (dna==NULL) // DNA is a null argument
{
*is_null=1;
return NULL;
}
/* the length of the returned string will be 'size' */
*length=size;


/** try to reallocate our memory to store the new transformed DNA sequence */
ptr= (char*)realloc(initid->ptr,sizeof(char)*(size));//no need (size+1)

/* out of memory ? */
if(ptr==NULL)
{
*is_null=1;
*error=1;
strncpy(error,"Out Of Memory",MYSQL_ERRMSG_SIZE);
return NULL;
}
initid->ptr=ptr;
*is_null=0;
*error=0;

/* build the reverse complement */
for(i=0;i< size;++i)
{
initid->ptr[i] = complement( dna[(size-1)-i] );
}

/* return our pointer */
return initid->ptr;
}



And here is the Makefile for my machine...


/usr/lib/revcomp.so:revcomp.c
gcc -fPIC -shared -I/usr/include/mysql -DDBUG_OFF -O3 -lmysqlclient -o $@ $<




The function was installed on mysql using the following statement:
>mysql create function revcomp returns string SONAME "revcomp.so";


TESTS:


mysql> select revcomp("GAATTC");
+-------------------+
| revcomp("GAATTC") |
+-------------------+
| GAATTC |
+-------------------+
1 row in set (0,00 sec)

mysql> select revcomp(revcomp("AAATTTaaatttGC"));
+------------------------------------+
| revcomp(revcomp("AAATTTaaatttGC")) |
+------------------------------------+
| AAATTTaaatttGC |
+------------------------------------+
1 row in set (0,00 sec)

mysql> select revcomp("SWatgcatgAAATTTaaatttGC");
+------------------------------------+
| revcomp("SWatgcatgAAATTTaaatttGC") |
+------------------------------------+
| GCaaatttAAATTTcatgcatWS |
+------------------------------------+
1 row in set (0,00 sec)



here I create a table of primers and I find the all sub-sequences that could be amplified.


mysql> create table primers(id int primary key auto_increment, primer
varchar(100));
Query OK, 0 rows affected (0,02 sec)

mysql> desc primers;
+--------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+--------+--------------+------+-----+---------+----------------+
| id | int(11) | | PRI | NULL | auto_increment |
| primer | varchar(100) | YES | | NULL | |
+--------+--------------+------+-----+---------+----------------+
2 rows in set (0,00 sec)


mysql> insert into primers(primer) values
("CAAAATGAAACAGGT"),
("TAATCAAATAATGCCTGGATTTCTT"),
("TGAACATCCGTCCTCTCCCCACAAA"),
("TCATAGTCCTGAGGAAAGAGAA"),
("TCAAACAAGGAAAATGGAAAACAAATTCA"),
("TCACTGCTGGATGTGTGGGAAAAACTGCA"),
("CACTGTTGCAGTTTTTCCCACAC"),
("TCAGGACTATGAGCAAAGGAACA"),
("TACCAAGTACCTGCGCTCCAGGTACATTT"),
("AGATACCTGTTTCATTT"),
("CTTTTCAGTGGTTGATGCTCAAGATG"),
("GCCTGACGAAGTTAAAACTGATATTGA")
;

Query OK, 12 rows affected (0,00 sec)
Records: 12 Duplicates: 0 Warnings: 0


mysql> set
@seq:="CACTGCTTCTCACTGTTGCAGTTTTTCCCACACATCCAGCAGTGAAAGTCCACTGTAACTTCAGCATAAT
CTGTTGGCATGTGAATTTGTTTTCCATTTTCCTTGTTTGACTGACTGGCTATGTCTTCACTTTTTTCATTTTTTTTG
ACTCTTGAGCCATAGTTCGTAAAATTGCTCCATATCTTGTATCCCATTCTCCTTCATGTAAGTCCAAACTTCTCTTT
CCTCAGGACTATGAGCAAAGGAACAGTTTCCAACATATTGACATTTTTTCCCAGAAGCAATATGGTTGCACAGATCA
AACTGTAAAGGCATTTGTTTCTTTGTGGGGAGAGGACGGATGTTCATCCACTTCTTACGTTCAATAGACATCACTCT
CATCGCACGCCGGTCTTTGGTCCACGAATGCCTTGCTTTTGCACTACAATATTTTCTGTTTTTGTCTGGTTCAATGA
CTTGACCGTTTCTCAGACACTGGGCGCACACAAACTTTATCTTCATATTAAGAAATCCAGGCATTATTTGATTACCA
AGTACCTGCGCTCCAGGTACATTTGCTTCCAAATTCTGCCAATATCGTTTAGACTCTTGAGCAATAGCATCATGTGA
GATACCTGTTTCATTTTGCCG";
Query OK, 0 rows affected (0,00 sec)

mysql> select
concat("ID.",Forward.id) as "Forward" ,
locate(Forward.primer,@seq) as "Start",
concat("ID.",Revers.id) as "Reverse",
locate(revcomp(Revers.primer),@seq)+length(Revers.primer) as "End",
repeat('*',(locate(revcomp(Revers.primer),@seq)+length(Revers.primer)-locate(Forward.primer,@seq) )/16) as "schema"
from
primers as Forward,
primers as Revers
where
locate(Forward.primer,@seq)>0 and
locate(revcomp(Revers.primer),@seq) > locate(Forward.primer,@seq)
group by 1,3 order by 5,2;

+---------+-------+---------+------+----------------------------------------+
| Forward | Start | Reverse | End | schema |
+---------+-------+---------+------+----------------------------------------+
| ID.10 | 609 | ID.1 | 628 | * |
| ID.7 | 11 | ID.6 | 46 | ** |
| ID.7 | 11 | ID.5 | 111 | ****** |
| ID.9 | 528 | ID.1 | 628 | ****** |
| ID.8 | 227 | ID.3 | 348 | ******* |
| ID.7 | 11 | ID.4 | 239 | ************** |
| ID.8 | 227 | ID.2 | 530 | ****************** |
| ID.7 | 11 | ID.3 | 348 | ********************* |
| ID.8 | 227 | ID.1 | 628 | ************************* |
| ID.7 | 11 | ID.2 | 530 | ******************************** |
| ID.7 | 11 | ID.1 | 628 | ************************************** |
+---------+-------+---------+------+----------------------------------------+
11 rows in set (0,00 sec)


That's it.

Pierre

2 comments:

Tim Yates said...

Cool! I had to write one of these a few months ago, but I went the "stored procedure" route... I reckon yours is faster... That's something I'm going to have to investigate!! :-D

Here's the function I wrote for completeness... (obviously, blogger doesn't accept PRE tags, so the formatting will be totally messed up) :-(

DELIMITER $$
DROP FUNCTION IF EXISTS `xmap_reverseComplement`$$
CREATE FUNCTION `xmap_reverseComplement` ( sequence TEXT, shouldPerform TINYINT(1) ) RETURNS TEXT DETERMINISTIC NO SQL
BEGIN
-- This function returns the reverse complement of 'sequence' IF 'shouldPerform' is non-zero
-- Otherwise, just returns the original sequence. Unknown bases are ignored, and not translated
DECLARE ret TEXT DEFAULT '' ;
DECLARE idx INT DEFAULT 1 ;
DECLARE current CHAR( 1 ) ;
IF shouldPerform <> 0 THEN
SET sequence = REVERSE( sequence ) ;
WHILE idx <= CHAR_LENGTH( sequence ) DO
SET current = SUBSTRING( sequence, idx, 1 ) ;
IF current = 'T' OR current = 't' THEN
SET ret = CONCAT( ret, 'A' ) ;
ELSEIF current = 'G' OR current = 'g' THEN
SET ret = CONCAT( ret, 'C' ) ;
ELSEIF current = 'C' OR current = 'c' THEN
SET ret = CONCAT( ret, 'G' ) ;
ELSEIF current = 'A' OR current = 'a' THEN
SET ret = CONCAT( ret, 'T' ) ;
ELSE
SET ret = CONCAT( ret, current ) ;
END IF ;
SET idx = idx + 1 ;
END WHILE ;
ELSE
SET ret = sequence ;
END IF ;
RETURN ret ;
END $$
DELIMITER ;

(obviously needs MySQL 5+, and isn't as complete as yours as it just handles DNA bases...)

Anonymous said...

Hi,

the MySQL server crashes, if you pass an empty string.

Add a

if (size == 0) {
*is_null=0;
*error=0;
return initid->ptr;
}

below of "*length=size;".

I also added translation from 'U|u' to 'A|a'.