YOKOFAKUN: A mysql user defined function to get the reverse complement of a sequence

I was asked today to compare some experimental genotypes to the
theoretical data. Unfortunaly many genotypes were given on the
anti-parallele strand compared to their references. So I wrote a small
href="http://dev.mysql.com/doc/refman/5.0/en/adding-functions.html">mysql
user defined function (UDF) to implement a new function
called 'revcomp' in mysql used to return the reverse complement of
a DNA sequence. Written in 'C/C++', this kind of function can be used
to embed bioinformatics into mysql. The coding was straighforward as I
already wrote a UDF translating a cDNA to a proteic sequence in a href="http://plindenbaum.blogspot.com/2006/07/mysql-user-defined-function-udf-for.html">previous
post.

Here is the code.

#include <my_global.h>
#include <m_ctype.h>
#include <mysql.h>

#include <m_string.h>
/* this function is called by mysql to initialize it */
my_bool revcomp_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
/* this function is called by mysql to dispose it */
void revcomp_deinit(UDF_INIT *initid);
/* the main function with get the reverse complement of a dna */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
               unsigned long *length, char *is_null, char *error);

/* a trivial function returning the complementary base of an acid nucleic */
static char complement(char b)
 {
 switch(b)
  {
                case 'A': return 'T';
                case 'T': return 'A';
                case 'G': return 'C';
                case 'C': return 'G';

                case 'a': return 't';
                case 't': return 'a';
                case 'g': return 'c';
                case 'c': return 'g';

                case 'w': return 'w';
                case 'W': return 'W';

                case 's': return 's';
                case 'S': return 'S';

                case 'y': return 'r';
                case 'Y': return 'R';

                case 'r': return 'y';
                case 'R': return 'Y';

                case 'k': return 'm';
                case 'K': return 'M';

                case 'm': return 'k';
                case 'M': return 'K';

                case 'b': return 'v';
                case 'd': return 'h';
                case 'h': return 'd';
                case 'v': return 'b';


                case 'B': return 'V';
                case 'D': return 'H';
                case 'H': return 'D';
                case 'V': return 'B';

                case 'N': return 'N';
                case 'n': return 'n';

  }
 return '?';
 }


/** this function is called by mysql to initialize our revcomp function */
my_bool revcomp_init(
        UDF_INIT *initid,
        UDF_ARGS *args,
        char *message
        )
  {
 /* check we have one STRING argument */
  if (!(args->arg_count == 1 && args->arg_type[0] == STRING_RESULT ))
    {
    strncpy(message,"Bad parameter, expected a DNA",MYSQL_ERRMSG_SIZE);
    return 1;
    }
  initid->maybe_null=1;
  /* initid->ptr will be used to store the transformed sequence */
  initid->ptr= (char*)malloc(0);
  /* out of memory ? */
  if(initid->ptr==NULL)
        {
        strncpy(message,"Out Of Memory",MYSQL_ERRMSG_SIZE);
        return 1;
        }
  return 0;
  }

/** this function is called by mysql to dispose our revcomp function */
void revcomp_deinit(UDF_INIT *initid)
        {
 /* free the user ptr */
        if(initid->ptr!=NULL) free(initid->ptr);
        }

/** this is the function called by mysql to reverse-complement a DNA */
char *revcomp(UDF_INIT *initid, UDF_ARGS *args, char *result,
               unsigned long *length, char *is_null, char *error)
 {
  long i;
  /* the size of the input */
  long size= args->lengths[0];
  /* the DNA given as input */
  const char *dna=args->args[0];
  char *ptr=NULL;

  if (dna==NULL) // DNA is a null argument
   {
    *is_null=1;
    return NULL;
   }
/* the length of the returned string will be 'size' */
*length=size;


/** try to reallocate our memory to store the new transformed DNA sequence */
ptr= (char*)realloc(initid->ptr,sizeof(char)*(size));//no need (size+1) 

/* out of memory ? */
if(ptr==NULL)
        {
        *is_null=1;
        *error=1;
        strncpy(error,"Out Of Memory",MYSQL_ERRMSG_SIZE);
        return NULL;
        }
initid->ptr=ptr;
*is_null=0;        
*error=0;

/* build the reverse complement */
for(i=0;i< size;++i)
        {
 initid->ptr[i] = complement( dna[(size-1)-i] );
        }

/* return our pointer */
return initid->ptr;
}

And here is the Makefile for my machine...


/usr/lib/revcomp.so:revcomp.c
        gcc -fPIC -shared -I/usr/include/mysql -DDBUG_OFF -O3 -lmysqlclient -o $@ $<

The function was installed on mysql using the following statement:

>mysql create function revcomp returns string SONAME "revcomp.so";

TESTS:

mysql> select revcomp("GAATTC");
+-------------------+
| revcomp("GAATTC") |
+-------------------+
| GAATTC            |
+-------------------+
1 row in set (0,00 sec)

mysql> select revcomp(revcomp("AAATTTaaatttGC"));
+------------------------------------+
| revcomp(revcomp("AAATTTaaatttGC")) |
+------------------------------------+
| AAATTTaaatttGC                     |
+------------------------------------+
1 row in set (0,00 sec)

mysql> select revcomp("SWatgcatgAAATTTaaatttGC");
+------------------------------------+
| revcomp("SWatgcatgAAATTTaaatttGC") |
+------------------------------------+
| GCaaatttAAATTTcatgcatWS            |
+------------------------------------+
1 row in set (0,00 sec)

here I create a table of primers and I find the all sub-sequences that could be amplified.

mysql> create table primers(id int primary key auto_increment, primer 
varchar(100));
Query OK, 0 rows affected (0,02 sec)

mysql> desc primers;
+--------+--------------+------+-----+---------+----------------+
| Field  | Type         | Null | Key | Default | Extra          |
+--------+--------------+------+-----+---------+----------------+
| id     | int(11)      |      | PRI | NULL    | auto_increment |
| primer | varchar(100) | YES  |     | NULL    |                |
+--------+--------------+------+-----+---------+----------------+
2 rows in set (0,00 sec)


mysql> insert into primers(primer) values 
("CAAAATGAAACAGGT"),
("TAATCAAATAATGCCTGGATTTCTT"),
("TGAACATCCGTCCTCTCCCCACAAA"),
("TCATAGTCCTGAGGAAAGAGAA"),
("TCAAACAAGGAAAATGGAAAACAAATTCA"),
("TCACTGCTGGATGTGTGGGAAAAACTGCA"),
("CACTGTTGCAGTTTTTCCCACAC"),
("TCAGGACTATGAGCAAAGGAACA"),
("TACCAAGTACCTGCGCTCCAGGTACATTT"),
("AGATACCTGTTTCATTT"),
("CTTTTCAGTGGTTGATGCTCAAGATG"),
("GCCTGACGAAGTTAAAACTGATATTGA")
;

Query OK, 12 rows affected (0,00 sec)
Records: 12  Duplicates: 0  Warnings: 0


mysql> set 
@seq:="CACTGCTTCTCACTGTTGCAGTTTTTCCCACACATCCAGCAGTGAAAGTCCACTGTAACTTCAGCATAAT
CTGTTGGCATGTGAATTTGTTTTCCATTTTCCTTGTTTGACTGACTGGCTATGTCTTCACTTTTTTCATTTTTTTTG
ACTCTTGAGCCATAGTTCGTAAAATTGCTCCATATCTTGTATCCCATTCTCCTTCATGTAAGTCCAAACTTCTCTTT
CCTCAGGACTATGAGCAAAGGAACAGTTTCCAACATATTGACATTTTTTCCCAGAAGCAATATGGTTGCACAGATCA
AACTGTAAAGGCATTTGTTTCTTTGTGGGGAGAGGACGGATGTTCATCCACTTCTTACGTTCAATAGACATCACTCT
CATCGCACGCCGGTCTTTGGTCCACGAATGCCTTGCTTTTGCACTACAATATTTTCTGTTTTTGTCTGGTTCAATGA
CTTGACCGTTTCTCAGACACTGGGCGCACACAAACTTTATCTTCATATTAAGAAATCCAGGCATTATTTGATTACCA
AGTACCTGCGCTCCAGGTACATTTGCTTCCAAATTCTGCCAATATCGTTTAGACTCTTGAGCAATAGCATCATGTGA
GATACCTGTTTCATTTTGCCG"; 
Query OK, 0 rows affected (0,00 sec)

mysql> select 
concat("ID.",Forward.id) as "Forward" , 
locate(Forward.primer,@seq) as "Start",
concat("ID.",Revers.id) as "Reverse",
locate(revcomp(Revers.primer),@seq)+length(Revers.primer)  as "End",
repeat('*',(locate(revcomp(Revers.primer),@seq)+length(Revers.primer)-locate(Forward.primer,@seq) )/16) as "schema"
from
primers as Forward,
primers as Revers
where 
locate(Forward.primer,@seq)>0 and
locate(revcomp(Revers.primer),@seq) > locate(Forward.primer,@seq) 
group by 1,3 order by 5,2;

+---------+-------+---------+------+----------------------------------------+
| Forward | Start | Reverse | End  | schema                                 |
+---------+-------+---------+------+----------------------------------------+
| ID.10   |   609 | ID.1    |  628 | *                                      |
| ID.7    |    11 | ID.6    |   46 | **                                     |
| ID.7    |    11 | ID.5    |  111 | ******                                 |
| ID.9    |   528 | ID.1    |  628 | ******                                 |
| ID.8    |   227 | ID.3    |  348 | *******                                |
| ID.7    |    11 | ID.4    |  239 | **************                         |
| ID.8    |   227 | ID.2    |  530 | ******************                     |
| ID.7    |    11 | ID.3    |  348 | *********************                  |
| ID.8    |   227 | ID.1    |  628 | *************************              |
| ID.7    |    11 | ID.2    |  530 | ********************************       |
| ID.7    |    11 | ID.1    |  628 | ************************************** |
+---------+-------+---------+------+----------------------------------------+
11 rows in set (0,00 sec)

That's it.

Pierre

2 comments:

AnonymousTuesday, 25 September, 2007
Cool! I had to write one of these a few months ago, but I went the "stored procedure" route... I reckon yours is faster... That's something I'm going to have to investigate!! :-D

Here's the function I wrote for completeness... (obviously, blogger doesn't accept PRE tags, so the formatting will be totally messed up) :-(

DELIMITER $$
DROP FUNCTION IF EXISTS `xmap_reverseComplement`$$
CREATE FUNCTION `xmap_reverseComplement` ( sequence TEXT, shouldPerform TINYINT(1) ) RETURNS TEXT DETERMINISTIC NO SQL
BEGIN
-- This function returns the reverse complement of 'sequence' IF 'shouldPerform' is non-zero
-- Otherwise, just returns the original sequence. Unknown bases are ignored, and not translated
DECLARE ret TEXT DEFAULT '' ;
DECLARE idx INT DEFAULT 1 ;
DECLARE current CHAR( 1 ) ;
IF shouldPerform <> 0 THEN
SET sequence = REVERSE( sequence ) ;
WHILE idx <= CHAR_LENGTH( sequence ) DO
SET current = SUBSTRING( sequence, idx, 1 ) ;
IF current = 'T' OR current = 't' THEN
SET ret = CONCAT( ret, 'A' ) ;
ELSEIF current = 'G' OR current = 'g' THEN
SET ret = CONCAT( ret, 'C' ) ;
ELSEIF current = 'C' OR current = 'c' THEN
SET ret = CONCAT( ret, 'G' ) ;
ELSEIF current = 'A' OR current = 'a' THEN
SET ret = CONCAT( ret, 'T' ) ;
ELSE
SET ret = CONCAT( ret, current ) ;
END IF ;
SET idx = idx + 1 ;
END WHILE ;
ELSE
SET ret = sequence ;
END IF ;
RETURN ret ;
END $$
DELIMITER ;

(obviously needs MySQL 5+, and isn't as complete as yours as it just handles DNA bases...)
AnonymousThursday, 14 August, 2008
Hi,

the MySQL server crashes, if you pass an empty string.

Add a

if (size == 0) {
*is_null=0;
*error=0;
return initid->ptr;
}

below of "*length=size;".

I also added translation from 'U|u' to 'A|a'.

25 September 2007

A mysql user defined function to get the reverse complement of a sequence

2 comments: