Generating a WIG file containing the coverage data of a BAM file
The following (experimental) C-code generates a 'Wiggle Track Format' (WIG) file containing the coverage/depth data extracted from a BAM file.
It uses the C API of samtools to loop over the mapped short reads of the BAM file, and for each short-read, it scans the CIGAR description and increase the number of hits seen on the chromosome at a given position.
It takes as an optional UCSC knownGene.txt file to restrict the output to the known translated regions.
The C code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Author: | |
Pierre Lindenbaum PhD | |
WWW: | |
http://plindenbaum.blogspot.com | |
contact: | |
plindenbaum@yahoo.fr | |
Motivation: | |
creates a WIG file from a BAM file | |
Compilation: | |
gcc -m64 -I path/to/samtools-0.1.7a -L path/to/samtools-0.1.7a bam2wig.c -lbam -lz | |
See also: | |
API: http://samtools.sourceforge.net/samtools/sam/ | |
*/ | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <assert.h> | |
#include <limits.h> | |
#include "bam.h" | |
#include "sam.h" | |
typedef short depth_t; | |
#define LOG(a) a | |
#define WHERE fprintf(stderr,"%s:%s,%d\n",__FILE__,__FUNCTION__,__LINE__) | |
static const depth_t INIT_IGNORE_THIS_BASE=-1; | |
static const depth_t INIT_DEFAULT=0; | |
typedef struct Reference_t | |
{ | |
depth_t init; | |
depth_t* data; | |
int length; | |
int buffer_size; | |
}Reference,*ReferencePtr; | |
static ReferencePtr referenceNew(depth_t init) | |
{ | |
ReferencePtr ptr= calloc(1,sizeof(Reference)); | |
if(ptr==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
ptr->init=init; | |
ptr->buffer_size=100000000; | |
ptr->data=malloc(ptr->buffer_size*sizeof(depth_t)); | |
if(ptr->data==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
memset(ptr->data,0,(ptr->buffer_size*sizeof(depth_t))); | |
ptr->length=0; | |
return ptr; | |
} | |
void referenceFree(ReferencePtr ptr) | |
{ | |
free(ptr->data); | |
free(ptr); | |
} | |
void referenceEnsureIndex(ReferencePtr ptr,int pos) | |
{ | |
if(pos>= ptr->buffer_size) | |
{ | |
int new_buffer_size=pos+1000000; | |
LOG(fprintf(stderr,"[LOG]Realloc to %d\n",new_buffer_size)); | |
ptr->data=realloc(ptr->data,new_buffer_size*sizeof(depth_t)); | |
if(ptr->data==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
ptr->buffer_size=new_buffer_size; | |
} | |
while( ptr->length <= pos ) | |
{ | |
ptr->data[ptr->length]=ptr->init; | |
ptr->length++; | |
} | |
} | |
void referenceReadWasSeen(ReferencePtr ptr,int pos) | |
{ | |
referenceEnsureIndex(ptr,pos); | |
if( ptr->data[pos]==INIT_IGNORE_THIS_BASE) return; | |
if( ptr->data[pos]==SHRT_MAX ) return; | |
ptr->data[pos]++; | |
} | |
void referenceDump(const ReferencePtr ptr,const char* name,FILE* out) | |
{ | |
int pos=0; | |
while(pos< ptr->length) | |
{ | |
//skip empty | |
while(pos< ptr->length && ptr->data[pos]==ptr->init) | |
{ | |
++pos; | |
} | |
if(pos>= ptr->length) break; | |
fprintf(out,"fixedStep chrom=%s start=%d step=1 span=1\n",name,pos); | |
while(pos< ptr->length && ptr->data[pos]!=ptr->init) | |
{ | |
fprintf(out,"%d\n",ptr->data[pos]); | |
++pos; | |
} | |
} | |
} | |
static char* readline(FILE* in,int *len) | |
{ | |
int c; | |
for(;;) | |
{ | |
char* buff=NULL; | |
int buffer_size=BUFSIZ; | |
*len=0; | |
if(feof(in)) return NULL; | |
buff=malloc(buffer_size); | |
if(buff==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
buff[0]=0; | |
while((c=fgetc(in))!=EOF) | |
{ | |
if(c=='\n') break; | |
if(*len+2>=buffer_size) | |
{ | |
buff=realloc(buff,buffer_size+BUFSIZ); | |
if(buff==NULL) | |
{ | |
fprintf(stderr,"Out of memory\n"); | |
exit(EXIT_FAILURE); | |
} | |
buffer_size+=BUFSIZ; | |
} | |
buff[*len]=c; | |
(*len)++; | |
buff[*len]=0; | |
} | |
if(buff[0]!=0) | |
{ | |
return buff; | |
} | |
free(buff); | |
buff=NULL; | |
} | |
} | |
int main(int argc, char *args[]) | |
{ | |
samfile_t *fp_in = NULL; | |
bam1_t *b=NULL; | |
int optind=1; | |
char* knownGeneFile=NULL; | |
int reference_count=0; | |
int reference_index=0; | |
while(optind<argc) | |
{ | |
if(strcmp(args[optind],"-h")==0) | |
{ | |
printf("%s . Pierre Lindenbaum PhD. Compilation %s",args[0],__DATE__); | |
printf(" -k knownGene.txt (optional)"); | |
return 0; | |
} | |
else if(strcmp(args[optind],"-k")==0 && optind+1< argc) | |
{ | |
knownGeneFile=args[++optind]; | |
} | |
else if(args[optind][0]=='-' && args[optind][1]=='-') | |
{ | |
optind++; | |
break; | |
} | |
else if(args[optind][0]=='-') | |
{ | |
fprintf(stderr,"Unnown option: %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(optind+1!=argc) | |
{ | |
fprintf(stderr,"Illegal number of arguments\n"); | |
return EXIT_FAILURE; | |
} | |
fp_in = samopen(args[optind], "rb", 0); | |
if(NULL == fp_in) | |
{ | |
fprintf(stderr,"Could not open file %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
reference_count=fp_in->header-> n_targets; | |
samclose(fp_in); | |
fprintf(stdout,"track type=wiggle_0 name=__SED_THIS__ viewLimits=0:100\n"); | |
for( reference_index=0; | |
reference_index < reference_count; | |
++reference_index | |
) | |
{ | |
ReferencePtr chromosome; | |
fp_in = samopen(args[optind], "rb", 0); | |
chromosome=referenceNew(knownGeneFile==NULL?INIT_DEFAULT:INIT_IGNORE_THIS_BASE); | |
if(NULL == fp_in) | |
{ | |
fprintf(stderr,"Could not open file %s\n",args[optind]); | |
return EXIT_FAILURE; | |
} | |
LOG(fprintf(stderr,"[LOG]Reading Chromosome:%s\n",fp_in->header->target_name[reference_index])); | |
if(knownGeneFile!=NULL) | |
{ | |
int n=0; | |
char* line=NULL; | |
FILE* kgFile=fopen(knownGeneFile,"r"); | |
if(kgFile==NULL) | |
{ | |
fprintf(stderr,"Cannot open %s\n",knownGeneFile); | |
return EXIT_FAILURE; | |
} | |
while((line=readline(kgFile,&n))!=NULL) | |
{ | |
int i; | |
int tIdx=1; | |
char* tokens[12]; | |
if(line[0]==0 || strncmp(line,"name\t",5)==0) | |
{ | |
free(line); | |
continue; | |
} | |
tokens[0]=line; | |
for(i=0;i< n;++i) | |
{ | |
if(line[i]=='\t' && tIdx< 12) | |
{ | |
tokens[tIdx]=&line[i+1]; | |
line[i]=0; | |
++tIdx; | |
} | |
} | |
if(tIdx!=12) | |
{ | |
fprintf(stderr,"BOUM \"%s\" %d\n",line,tIdx); | |
exit(EXIT_FAILURE); | |
} | |
if(strcmp(tokens[1],fp_in->header->target_name[reference_index])==0) | |
{ | |
int cdsStart=atoi(tokens[5]); | |
int cdsEnd=atoi(tokens[6]); | |
char *start=tokens[8]; | |
char *end=tokens[9]; | |
while(*start!=0 && *end!=0) | |
{ | |
char *next_start=NULL; | |
char *next_end=NULL; | |
int x1=(int)strtol(start,&next_start,10); | |
int x2=(int)strtol(end,&next_end,10); | |
while(x1<x2) | |
{ | |
if(x1>=cdsStart && x1<cdsEnd) | |
{ | |
referenceEnsureIndex(chromosome,x1+1); | |
chromosome->data[x1+1]=0; | |
} | |
++x1; | |
} | |
if(*next_start==',') ++next_start; | |
if(*next_end==',') ++next_end; | |
if(*next_start==0 || *next_end==0) break; | |
start=next_start; | |
end=next_end; | |
} | |
} | |
free(line); | |
} | |
} | |
b = bam_init1(); | |
int pos=0; | |
while(samread(fp_in, b) > 0) | |
{ | |
if(b->core.tid != reference_index) | |
{ | |
bam_destroy1(b); | |
b = bam_init1(); | |
continue; | |
} | |
pos = b->core.pos; | |
uint32_t* cigar= bam1_cigar(b); | |
int k; | |
for( k=0;k< b->core.n_cigar;++k) | |
{ | |
int cop =cigar[k] & BAM_CIGAR_MASK; // operation | |
int cl = cigar[k] >> BAM_CIGAR_SHIFT; // length | |
switch(cop) | |
{ | |
case BAM_CMATCH: | |
{ | |
while(cl>0) | |
{ | |
referenceReadWasSeen(chromosome,pos); | |
pos++; | |
cl--; | |
} | |
break; | |
} | |
case BAM_CHARD_CLIP:break; /* ignore */ | |
case BAM_CSOFT_CLIP:break; /* ignore */ | |
case BAM_CPAD: //... | |
case BAM_CREF_SKIP://.... | |
case BAM_CDEL: | |
{ | |
/* Spans positions, No Coverage */ | |
pos+=cl; | |
break; | |
} | |
case BAM_CINS: /* cursor not moved on reference */ ;break; | |
default:printf("?");assert(0);break; | |
} | |
} | |
bam_destroy1(b); | |
b = bam_init1(); | |
} | |
bam_destroy1(b); | |
referenceDump(chromosome,fp_in->header->target_name[reference_index],stdout); | |
referenceFree(chromosome); | |
samclose(fp_in); | |
} | |
return 0; | |
} |
Test
This Makefile generates a random pair of FASTQ files for the human chr22, maps them with BWA , creates a BAM and generates the WIG file:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
PROXY=--proxy host:port | |
bwa.dir=${HOME}/package/bwa/bwa-0.5.7 | |
bwa.bin=${bwa.dir}/bwa | |
sam.dir=${HOME}/package/sam/samtools-0.1.7a | |
sam.bin=${sam.dir}/samtools | |
CFLAGS= -Wall -O3 #-m64 | |
LIMIT=1000004 | |
data.wig:bam2wig knownGenes.txt sorted1.bam | |
./bam2wig -k knownGenes.txt sorted1.bam | sed 's/__SED_THIS__/chr22Genes/' > $@ | |
bam2wig: ${HOME}/bam2wig.c | |
gcc ${CFLAGS} -o $@ -I ${sam.dir} -L ${sam.dir} $< -lbam -lz | |
sorted1.bam:aln.bam | |
${sam.bin} sort aln.bam sorted1 | |
${sam.bin} index sorted1.bam | |
aln.bam:aln.sam | |
${sam.bin} view -b -T chr22.fa aln.sam > $@ | |
aln.sam:aln1.sai aln2.sai reads_1.fastq reads_2.fastq | |
${bwa.bin} sampe chr22db aln1.sai aln2.sai reads_1.fastq reads_2.fastq > $@ | |
aln2.sai:chr22db.bwt reads_2.fastq | |
${bwa.bin} aln chr22db reads_2.fastq > $@ | |
aln1.sai:chr22db.bwt reads_1.fastq | |
${bwa.bin} aln chr22db reads_1.fastq > $@ | |
reads_1.fastq reads_2.fastq: | |
${sam.dir}/misc/wgsim chr22.fa reads_1.fastq reads_2.fastq > _rand.txt | |
chr22db.bwt:chr22.fa | |
${bwa.bin} index -p chr22db -a bwtsw chr22.fa | |
chr22.fa.fai:chr22.fa | |
${sam.bin} faidx chr22.fa | |
knownGenes.txt: | |
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/knownGene.txt.gz" |\ | |
gunzip -c | grep chr22 > $@ | |
chr22.fa: | |
curl ${PROXY} "http://hgdownload.cse.ucsc.edu/goldenPath/hg18/chromosomes/chr22.fa.gz" |\ | |
gunzip -c > $@ | |
Result
The resulting WIG file was uploaded as a custom track in the UCSC genome browser:That's it,
Pierre
No comments:
Post a Comment