Memory-Mapping the Human Genome with 'mmap': my notebook
In this post, I've explored how to use a memory-mapped file to handle the 3Go of the Human Genome sequence as if it was entirely loaded in memory.
Via wikipedia: "A memory-mapped file is a segment of virtual memory which has been assigned a direct byte-for-byte correlation with some portion of a file or file-like resource. This resource is typically a file that is physically present on-disk... In computing, mmap is a POSIX-compliant Unix system call that maps files or devices into memory. It is a method of memory-mapped file I/O. It naturally implements demand paging, because initially file contents are not entirely read from disk and do not use physical RAM at all. The actual reads from disk are performed in "lazy" manner, after a specific location is accessed."
Using a C++ program, I'm going to memory-map the fasta sequence of the human genome (indexed with samtools faidx) and search the position of some short-reads using a BoyerMoore algorithm:
That's it,
Pierre
Via wikipedia: "A memory-mapped file is a segment of virtual memory which has been assigned a direct byte-for-byte correlation with some portion of a file or file-like resource. This resource is typically a file that is physically present on-disk... In computing, mmap is a POSIX-compliant Unix system call that maps files or devices into memory. It is a method of memory-mapped file I/O. It naturally implements demand paging, because initially file contents are not entirely read from disk and do not use physical RAM at all. The actual reads from disk are performed in "lazy" manner, after a specific location is accessed."
Using a C++ program, I'm going to memory-map the fasta sequence of the human genome (indexed with samtools faidx) and search the position of some short-reads using a BoyerMoore algorithm:
Required members:
/* maps a chromosome to its samtools faidx index */
map<string,faidx1_t> name2index;
/* used to get the size of the file */
struct stat buf;
/* genome fasta file file descriptor */
int fd;
/* the mmap (memory mapped) pointer */
char *mapptr;
Opening the mmap
string faidx(fastaFile);
string line;
faidx.append(".fai");
/* open *.fai file */
ifstream in(faidx.c_str(),ios::in);
/* read indexes in the .fai file that was created with samtools */
while(getline(in,line,'\n'))
{
faidx1_t index;
//parse the faidx line...
//...
name2index.insert(make_pair(chrom,index));
}
/* close index file */
in.close();
/* get the whole size of the fasta file */
stat(fasta, &buf);
/* open the fasta file */
fd = open(fastaFile, O_RDONLY);
/* open a memory mapped file associated to this fasta file descriptor */
mapptr = (char*)mmap(0, buf.st_size, PROT_READ, MAP_SHARED, fd, 0);
It's a kind of MAGIC: Getting the base at index 'position-th' of chromosome 'chrom'
std::map<std::string,faidx1_t>::iterator r=name2index.find(chrom);
faidx1_t& index=r->second;
char base=at(&index,position);
(...)
/* returns the base at position 'index' for the chromosome indexed by faidx */
char at(const FaidxPtr faidx,int64_t index)
{
long pos= faidx->offset +
index / faidx->line_blen * faidx->line_len +
index % faidx->line_blen
;
/* here is the magic: no need to fseek/fread/ftell the file */
return toupper(mapptr[pos]);
}
Mapping the short reads
I've hacked a simple Boyer-Moore-Horspool algorithm from ttp://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm. Of course, you wouldn't use this algorithm to map your short reads for real :-) .Disposing the mmap
/* close memory mapped map */
if(mapptr!=NULL) munmap(mapptr,buf.st_size);
/* dispose fasta file descriptor */
if(fd!=-1) close(fd);
Compile & run
$ g++ -Wall testmmap.cpp -lz
$ ./a.out -g /path/tp/hg19.fa /path/to/my.fastq.gz
ATCATTTTCCTCCTAACAGATTAAAAATCAAGAAATATAAACCAGATGTAGCAG chr11 93065362
(...)
Source code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Author: | |
* Pierre Lindenbaum PhD | |
* Contact: | |
* plindenbaum@yahoo.fr | |
* WWW: | |
* http://plindenbaum.blogspot.com | |
* Reference: | |
* | |
* Motivation: | |
* Mapping a large fasta file in memory using mmap | |
* Usage: | |
* g++ -Wall testmmap.cpp -lz | |
* ./a.out -g /path/to/genome/indexed/with/samtools-faidx/hg.fasta file.fastq | |
*/ | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <climits> | |
#include <stdint.h> | |
#include <cassert> | |
#include <fstream> | |
#include <iostream> | |
#include <algorithm> | |
#include <map> | |
#include <cerrno> | |
#include <sys/types.h> | |
#include <sys/stat.h> | |
#include <fcntl.h> | |
#include <sys/mman.h> | |
#include <zlib.h> | |
using namespace std; | |
/* it is the structure created by samtools faidx */ | |
typedef struct faidx1_t | |
{ | |
int32_t line_len, line_blen; | |
int64_t len; | |
uint64_t offset; | |
}faidx1_t,*FaidxPtr; | |
/** returns the anti parallele base */ | |
static const char antiparralele(char c) | |
{ | |
switch(c) | |
{ | |
case 'A': return 'T'; | |
case 'T': return 'A'; | |
case 'G': return 'C'; | |
case 'C': return 'G'; | |
default:return 'N'; | |
} | |
} | |
/** | |
* wrapper for a mmap, a fileid and some faidx indexes | |
*/ | |
class IndexedGenome | |
{ | |
private: | |
/* maps a chromosome to the samtools faidx index */ | |
map<string,faidx1_t> name2index; | |
/* used to get the size of the file */ | |
struct stat buf; | |
/* genome fasta file file descriptor */ | |
int fd; | |
/* the mmap (memory mapped) pointer */ | |
char *mapptr; | |
/** reads an fill a string */ | |
bool readline(gzFile in,string& line) | |
{ | |
if(gzeof(in)) return false; | |
line.clear(); | |
int c=-1; | |
while((c=gzgetc(in))!=EOF && c!='\n') line+=(char)c; | |
return true; | |
} | |
public: | |
/** constructor | |
* @param fasta: the path to the genomic fasta file indexed with samtools faidx | |
*/ | |
IndexedGenome(const char* fasta):fd(-1),mapptr(NULL) | |
{ | |
string faidx(fasta); | |
string line; | |
faidx.append(".fai"); | |
/* open *.fai file */ | |
ifstream in(faidx.c_str(),ios::in); | |
if(!in.is_open()) | |
{ | |
cerr << "cannot open " << faidx << endl; | |
exit(EXIT_FAILURE); | |
} | |
/* read indexes in fai file */ | |
while(getline(in,line,'\n')) | |
{ | |
if(line.empty()) continue; | |
const char* p=line.c_str(); | |
char* tab=(char*)strchr(p,'\t'); | |
if(tab==NULL) continue; | |
string chrom(p,tab-p); | |
++tab; | |
faidx1_t index; | |
if(sscanf(tab,"%ld\t%ld\t%d\t%d", | |
&index.len, &index.offset, &index.line_blen,&index.line_len | |
)!=4) | |
{ | |
cerr << "Cannot read index in "<< line << endl; | |
exit(EXIT_FAILURE); | |
} | |
/* insert in the map(chrom,faidx) */ | |
name2index.insert(make_pair(chrom,index)); | |
} | |
/* close index file */ | |
in.close(); | |
/* get the whole size of the fasta file */ | |
if(stat(fasta, &buf)!=0) | |
{ | |
perror("Cannot stat"); | |
exit(EXIT_FAILURE); | |
} | |
/* open the fasta file */ | |
fd = open(fasta, O_RDONLY); | |
if (fd == -1) | |
{ | |
perror("Error opening file for reading"); | |
exit(EXIT_FAILURE); | |
} | |
/* open a memory mapped file associated to this fasta file descriptor */ | |
mapptr = (char*)mmap(0, buf.st_size, PROT_READ, MAP_SHARED, fd, 0); | |
if (mapptr == MAP_FAILED) | |
{ | |
close(fd); | |
perror("Error mmapping the file"); | |
exit(EXIT_FAILURE); | |
} | |
} | |
/* destructor */ | |
~IndexedGenome() | |
{ | |
/* close memory mapped map */ | |
if(mapptr!=NULL && munmap(mapptr,buf.st_size) == -1) | |
{ | |
perror("Error un-mmapping the file"); | |
} | |
/* dispose fasta file descriptor */ | |
if(fd!=-1) close(fd); | |
} | |
/* return the base at position 'index' for the chromosome indexed by faidx */ | |
char at(const FaidxPtr faidx,int64_t index) | |
{ | |
long pos= faidx->offset + | |
index / faidx->line_blen * faidx->line_len + | |
index % faidx->line_blen | |
; | |
/* here is the magic: no need to fseek/fread/ftell the file */ | |
return toupper(mapptr[pos]); | |
} | |
private: | |
/* search a DNA string in the genome | |
* simple Boyer-Moore-Horspool_algorithm hacked from | |
* http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm | |
*/ | |
void boyerMoore(const string& search) | |
{ | |
size_t bad_char_skip[UCHAR_MAX + 1]; | |
if (search.empty()) return; | |
for (size_t scan = 0; scan <= UCHAR_MAX; scan++) | |
{ | |
bad_char_skip[scan] = search.size(); | |
} | |
size_t last = search.size() - 1; | |
for (size_t scan = 0; scan < last; scan++) | |
{ | |
bad_char_skip[(size_t)search[scan]] = last - scan; | |
} | |
/* loop over each chromosome */ | |
for(std::map<std::string,faidx1_t>::iterator r=name2index.begin(); | |
r!=name2index.end(); | |
++r) | |
{ | |
faidx1_t& index=r->second; | |
size_t haystack_index=0; | |
while (haystack_index+ search.size() <= (size_t)index.len) | |
{ | |
size_t scan=last; | |
while( at(&index,haystack_index+scan) == search[scan] && | |
scan>0) | |
{ | |
--scan; | |
} | |
if(scan==0) | |
{ | |
cout << search << "\t"<< r->first << "\t" << haystack_index << endl; | |
haystack_index+=search.size(); | |
} | |
else | |
{ | |
char c=at(&index,haystack_index+last); | |
haystack_index += bad_char_skip[c]; | |
} | |
} | |
} | |
} | |
public: | |
/** read a fastq file an searches the sequences in the genome */ | |
void mapReads(gzFile in) | |
{ | |
string name; | |
string seq; | |
string name2; | |
string qual; | |
for(;;) | |
{ | |
if(!readline(in,name)) break; | |
if(!readline(in,seq)) break; | |
if(!readline(in,name2)) break; | |
if(!readline(in,qual)) break; | |
while(!seq.empty() && seq[0]=='N') | |
{ | |
seq.erase(0,1); | |
} | |
while(!seq.empty() && seq[seq.size()-1]=='N') | |
{ | |
seq.erase(seq.size()-1,1); | |
} | |
if(seq.size()<10) continue; | |
std::transform(seq.begin(), seq.end(),seq.begin(), ::toupper); | |
//search | |
boyerMoore(seq); | |
//rev-comp and search | |
reverse(seq.begin(),seq.end()); | |
std::transform(seq.begin(), seq.end(),seq.begin(), ::antiparralele); | |
boyerMoore(seq); | |
} | |
} | |
}; | |
int main(int argc,char** argv) | |
{ | |
char* genomePath=NULL; | |
int optind=1; | |
while(optind < argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
fprintf(stderr,"%s: Pierre Lindenbaum PHD. 2010.\n",argv[0]); | |
fprintf(stderr,"Compilation: %s at %s.\n",__DATE__,__TIME__); | |
fprintf(stderr," -g <path/to/genome/indexed/with/samtools/faids> FASQ[stdin|files] \n"); | |
exit(EXIT_FAILURE); | |
} | |
else if(strcmp(argv[optind],"-g")==0 && optind+1<argc) | |
{ | |
genomePath=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
++optind; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
fprintf(stderr,"unknown option '%s'\n",argv[optind]); | |
exit(EXIT_FAILURE); | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(genomePath==NULL) | |
{ | |
cerr << "genome path missing"<< endl; | |
return EXIT_FAILURE; | |
} | |
/* open IndexedGenome */ | |
IndexedGenome* genome=new IndexedGenome(genomePath); | |
if(optind==argc) | |
{ | |
/* read fastqs from stdin */ | |
genome->mapReads(gzdopen(fileno(stdin), "r")); | |
} | |
else | |
{ | |
/** loop over each fastq file */ | |
while(optind< argc) | |
{ | |
char* filename=argv[optind++]; | |
errno=0; | |
gzFile in=gzopen(filename,"rb"); | |
if(in==NULL) | |
{ | |
cerr << argv[0] | |
<< ": Cannot open "<< filename | |
<< ":" << strerror(errno) | |
<< endl; | |
exit(EXIT_FAILURE); | |
} | |
genome->mapReads(in); | |
gzclose(in); | |
} | |
} | |
/* dispose genome */ | |
delete genome; | |
cerr << "Done." << endl; | |
return 0; | |
} |
That's it,
Pierre
2 comments:
Hey Pierre,
You have an awesome blog. How fast does this run? I'm finding that this method takes quite a while to run to completion.
Hi Nikhil, I'm sorry, I didn't noticed a long time, but as far as I remember, I only tested it on chr22.
Post a Comment