09 September 2011

Customizing "C printf" to print a FASTA sequence.

The GNU C library lets you define your own custom conversion specifiers for printf template strings. In this post, I will show how to add a new conversion specifier to printf that prints a FASTA sequence. Say we have a C structure holding a FASTA sequence:

/* One FASTA record: a header line and the sequence that follows it. */
typedef struct fasta_t
	{
	/* text printed after the leading '>' of the header line */
	char *name;
	/* NUL-terminated sequence string */
	char *sequence;
	} Fasta;
The new output conversion is registered with register_printf_specifier.
register_printf_specifier(
		/* the new conversion specifier character */
		'W',
		/* handler function */
		printf_fasta_handler,
		/* arginfo function */
		printf_fasta_arginfo );

The handler function

"The handler-function is the function called by printf and friends when this conversion appears in a template string... The structure printf_info is used to pass information about the options appearing in an instance of a conversion specifier in a printf template string to the handler and arginfo functions for that specifier." Here the width restricts the number of sequence characters that are printed (no width means the whole sequence), and the precision defines the length of each output line (60 when no precision is given).
/* Write one character to `stream`; on success add 1 to *count and return 0,
   on write failure return -1 and leave *count untouched. */
static int fasta_putc(int c, FILE *stream, int *count)
	{
	if(fputc(c,stream)==EOF) return -1;
	++(*count);
	return 0;
	}

/**
 * Handler for the custom 'W' conversion, registered with
 * register_printf_specifier. Prints one FASTA record:
 * a '>' header line followed by the sequence folded into fixed-length lines.
 *
 * stream: output stream
 * info:   conversion options; info->width (if >= 1) caps the number of
 *         sequence characters printed, info->prec (if >= 1) is the line
 *         length, otherwise lines are 60 characters long
 * args:   args[0] points to the `const Fasta *` passed to printf
 *
 * Returns the number of characters actually written, or -1 if the record
 * pointer is NULL or a write to `stream` fails.
 */
static int printf_fasta_handler(
	FILE *stream, /*  stream output */
	const struct printf_info *info, /* information about the various options */
	const void *const *args /* arguments */
	)
	{
	int written=0;
	int n=0;
	const int lline=(info->prec<1?60:info->prec);
	const Fasta* fasta=*((const Fasta* const*)(args[0]));

	if(fasta==NULL) return -1;

	/* header line: '>' followed by the name, if any */
	if(fasta_putc('>',stream,&written)!=0) return -1;
	if(fasta->name!=NULL)
		{
		const char* s;
		/* count the characters ourselves: fputs() only reports success
		   or failure, not how many characters it wrote */
		for(s=fasta->name; *s!=0; ++s)
			{
			if(fasta_putc(*s,stream,&written)!=0) return -1;
			}
		}

	if(fasta->sequence!=NULL)
		{
		const char* seq=fasta->sequence;
		while(seq[n]!=0 && (info->width<1 || n<info->width))
			{
			/* a newline precedes every line of sequence; the first one
			   also terminates the header line */
			if(n%lline==0 && fasta_putc('\n',stream,&written)!=0) return -1;
			if(fasta_putc(seq[n],stream,&written)!=0) return -1;
			++n;
			}
		}

	/* terminate the last line (or the header if no sequence was printed) */
	if(fasta_putc('\n',stream,&written)!=0) return -1;

	/* returns the number of characters written */
	return written;
	}

The arginfo function

"The arginfo-function is the function called by parse_printf_format when this conversion appears in a template string."
/**
 * arginfo callback for the custom FASTA conversion.
 *
 * Reports that the conversion consumes exactly one argument, a pointer
 * (to the structure). The type is stored in argtypes[0] only when n is at
 * least 1; `info` and `size` are not used.
 *
 * Returns 1, the number of arguments the conversion takes.
 */
static int printf_fasta_arginfo(
	const struct printf_info *info,
	size_t n,
	int *argtypes,
	int *size)
	{
	(void)info;
	(void)size;

	if(n>=1)
		{
		argtypes[0]=PA_POINTER;
		}

	return 1;
	}

Printing the FASTA sequence:


/* the record is presumably filled in by the elided code below */
Fasta record;
(...)
/* width 80, no precision: at most 80 sequence characters, 60 per line */
printf("%80W\n",&record);
/* width 200, precision 90: at most 200 sequence characters, 90 per line */
printf("%200.90W\n",&record);
/* width 200, no precision: at most 200 sequence characters, 60 per line */
printf("%200W\n",&record);
/* no width, precision 80: the whole sequence, 80 per line */
printf("%.80W\n",&record);

Compile and run

$ gcc -Wall source.c
$ ./a.out < sequence.fa

>gi|27592135
GGAAGGGCTGCCCCACCATTCATCCTTTTCTCGTAGTTTGTGCACGGTGCGGGAGGTTGT
CTGAGTGACTTCACGGGTCG

>gi|27592135
GGAAGGGCTGCCCCACCATTCATCCTTTTCTCGTAGTTTGTGCACGGTGCGGGAGGTTGTCTGAGTGACTTCACGGGTCGCCTTTGTGCA
GTACTAGATATGCAGCAGACCTATGACATGTGGCTAAAGAAACACAATCCTGGGAAGCCTGGAGAGGGAACACCACTCACTTCGCGAGAA
GGGGAGAAACAGATCCAGAT

>gi|27592135
GGAAGGGCTGCCCCACCATTCATCCTTTTCTCGTAGTTTGTGCACGGTGCGGGAGGTTGT
CTGAGTGACTTCACGGGTCGCCTTTGTGCAGTACTAGATATGCAGCAGACCTATGACATG
TGGCTAAAGAAACACAATCCTGGGAAGCCTGGAGAGGGAACACCACTCACTTCGCGAGAA
GGGGAGAAACAGATCCAGAT

>gi|27592135
GGAAGGGCTGCCCCACCATTCATCCTTTTCTCGTAGTTTGTGCACGGTGCGGGAGGTTGTCTGAGTGACTTCACGGGTCG
CCTTTGTGCAGTACTAGATATGCAGCAGACCTATGACATGTGGCTAAAGAAACACAATCCTGGGAAGCCTGGAGAGGGAA
CACCACTCACTTCGCGAGAAGGGGAGAAACAGATCCAGATGCCCACTGACTATGCTGACATCATGATGGGCTACCACTGC
TGGCTCTGCGGGAAGAACAGCAACAGCAAGAAGCAATGGCAGCAGCACATCCAGTCAGAGAAGCACAAGGAGAAGGTCTT
CACCTCAGACAGTGACTCCAGCTGCTGGAGCTATCGCTTCCCTATGGGCGAGTTCCAGCTCTGTGAAAGGTACCATGCAC
ATGGCTCTGTTTGATCCCAGAAGTGATGACTACTTAGTGGTAAAAACACATTTCCAGACACACAACTTCAGAAAATGAGT
GCAAGCTTCAAGTCTGCCCTTTGTAGCCATAATGTGCTCAGCTCTCGGTCTGCTGAACAGAGTCTACTTGGCTCAATTCT
TGGGGGAATCCCAGATGCTTTATTAGATTGTTTGAATGTCTCACGCCCTCTGAATCAGTGCCTTGAGGTGCCTTCAGAAG
GCTTGTGATGGTTAGNNNTNGCATTTTGGTT

Code


That's it,

Pierre

No comments: