Short post: Plain text vs Binary data.
This week, I was asked the difference between storing some data using a plain text format and a binary format. I wrote the following code C++ to illustrate how to store some genotypes in a ( possibly huge )table saved as a binary file. The first bytes of this file tells the number of individuals, the number of markers, the name of the individuals. Then for each marker, we get the name of the marker, its position and an array of genotypes for each individual.
The first invocation of this program (-p write ) creates a random table and a second call (-p read ) answers the genotypes for some random individuals/markers.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
naive program showing how to manage a (large) linkage file with a binary file | |
Pierre Lindenbaum | |
*/ | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <vector> | |
#include <iostream> | |
using namespace std; | |
#define MAX_NAME 50 | |
#define MAX_INDIVIDUAL_NAME MAX_NAME | |
#define MAX_SNP_NAME MAX_NAME | |
/** gender */ | |
enum gender | |
{ | |
unknown=0, | |
male=1, | |
female=2 | |
}; | |
/** A SNP mapped on the genome */ | |
typedef struct marker_t | |
{ | |
/** rs Name */ | |
char name[MAX_SNP_NAME]; | |
/** Y index in the table */ | |
int rowIndex; | |
/** chromosome identifier */ | |
int chromosomeId; | |
/** position in the chromosome */ | |
int position; | |
}Marker,*MarkerPtr; | |
/** A Individual genotyped */ | |
typedef struct individual_t | |
{ | |
/** X column index in the table */ | |
int columnIndex; | |
/** his name */ | |
char name[MAX_INDIVIDUAL_NAME]; | |
/** index of the father of -1 if unknown */ | |
int fatherIndex; | |
/** index of the mother of -1 if unknown */ | |
int motherIndex; | |
/** George Michael wants it */ | |
gender sex; | |
}Individual,*IndividualPtr; | |
/** Alleles , just two bases */ | |
typedef struct alleles_t | |
{ | |
char alleles[2]; | |
}Alleles,*AllelesPtr; | |
/** | |
a class handling a 'linkage' binary file | |
X axis are the individuals (in memory) | |
Y axis are the SNP | |
values are the Marker | |
both Marker and Alleles are accessed using a random access | |
structure of the file | |
1 int = count(individuals) | |
1 int = count(markers) | |
count(individuals)*Individuals = Individuals | |
count(markers)*( Marker + alleles[count(individuals)]) = markers+genotypes | |
*/ | |
class Linkage | |
{ | |
private: | |
/** FILE pointer to the binary file */ | |
FILE* io_ptr; | |
/** All individuals in the pedigree */ | |
vector<IndividualPtr> individuals; | |
/** number of markers in the binary file */ | |
int count_markers; | |
/** last SNP fetched ,in memory, avoid to call fseek each time for the same marker */ | |
MarkerPtr current_snp; | |
/** last row of genotypes fetched, used in memory, avoid to call fseek each time for the same marker */ | |
AllelesPtr current_alleles; | |
private: | |
/** fetch the rowIndex-th SNP and the rowIndex-th array of Alleles */ | |
void _fetchRow(int rowIndex) | |
{ | |
//current SNP is already in memory | |
if(current_snp->rowIndex==rowIndex) return; | |
//change the file position | |
fseek( io_ptr, (2*sizeof(int) + //nbr individual + nbr markers | |
sizeof(Individual)*individuals.size()+ //individual names | |
(sizeof(Alleles)*individuals.size()+sizeof(Marker))*rowIndex //previous rows (Marker+alleles[]) | |
), | |
SEEK_SET | |
); | |
//read the row: | |
fread(current_snp,sizeof(Marker),1,io_ptr);//read the current_snp | |
fread(current_alleles,sizeof(Alleles),individuals.size(),io_ptr);//read the array of alleles | |
} | |
public: | |
Linkage():io_ptr(NULL),count_markers(-1),current_snp(NULL),current_alleles(NULL) | |
{ | |
} | |
~Linkage() | |
{ | |
close(); | |
} | |
void open(const char* filename,bool read_only) | |
{ | |
//close if it was already open | |
close(); | |
//open as binary | |
io_ptr=fopen(filename,(read_only?"rb":"wb")); | |
if(io_ptr==NULL) throw "cannot open file"; | |
if(read_only) | |
{ | |
int n_individuals=0; | |
//read the number of individuals | |
fread(&n_individuals,sizeof(int),1,io_ptr); | |
//alloc the buffer for current_snp | |
current_snp=new Marker; | |
current_snp->rowIndex=-1; | |
//alloc the buffer for current_allele | |
current_alleles=new Alleles[n_individuals]; | |
//read the number of SNP | |
fread(&count_markers,sizeof(int),1,io_ptr); | |
//read the definition of the individuals | |
for(int i=0;i< n_individuals;++i) | |
{ | |
IndividualPtr indi=new Individual; | |
fread(indi,sizeof(Individual),1,io_ptr); | |
//add it into the pedigree | |
individuals.push_back(indi); | |
} | |
} | |
} | |
void close() | |
{ | |
//clean individuals | |
while(!individuals.empty()) | |
{ | |
delete individuals.back(); | |
individuals.pop_back(); | |
} | |
//close file | |
if(io_ptr!=NULL) fclose(io_ptr); | |
io_ptr=NULL; | |
//clean current_snp and current_alleles | |
if(current_snp!=NULL) delete current_snp; | |
current_snp=NULL; | |
if(current_alleles!=NULL) delete [] current_alleles; | |
current_alleles=NULL; | |
} | |
//create a random binary file | |
void random() | |
{ | |
//create a pedigree | |
while(individuals.size()<100) | |
{ | |
//create a new individual | |
IndividualPtr newindi=new Individual; | |
newindi->columnIndex=individuals.size(); | |
newindi->fatherIndex=-1; | |
newindi->motherIndex=-1; | |
snprintf(newindi->name,MAX_INDIVIDUAL_NAME,"IndiId.%03d",individuals.size()); | |
//add this new individual | |
individuals.push_back(newindi); | |
} | |
int n=individuals.size(); | |
int n_markers=100000; | |
//save the number of individuals | |
fwrite(&n,sizeof(int),1,io_ptr); | |
//save the number of snps | |
fwrite(&n_markers,sizeof(int),1,io_ptr); | |
//write the individuals | |
for(int i=0; i< (int)individuals.size();++i) | |
{ | |
fwrite(individuals[i],sizeof(Individual),1,io_ptr); | |
} | |
//write all the SNPs | |
for(int i=0;i< n_markers;++i) | |
{ | |
Marker snp; | |
snprintf(snp.name,MAX_SNP_NAME,"rs%d",(i+1)); | |
snp.chromosomeId=12; | |
snp.position=i*2; | |
snp.rowIndex=i; | |
//write this snp | |
fwrite(&snp,sizeof(Marker),1,io_ptr); | |
//for each SNP , we add an array of Alleles[individuals.size()] | |
for(int j=0;j< (int)individuals.size();++j) | |
{ | |
Alleles a; | |
a.alleles[0]=(j%3==0?'A':(i%2==0?'A':'T')); | |
a.alleles[1]=(i%13==0?'C':'T');; | |
fwrite(&a,sizeof(Alleles),1,io_ptr); | |
} | |
} | |
//flush | |
fflush(io_ptr); | |
} | |
//get the marker at a given Y index | |
const MarkerPtr getMarker(int marker_index) | |
{ | |
_fetchRow(marker_index); | |
return current_snp; | |
} | |
//get the genotype for the given individual and the given snp | |
const AllelesPtr get(const IndividualPtr indi,const MarkerPtr snp) | |
{ | |
_fetchRow(snp->rowIndex); | |
return ¤t_alleles[indi->columnIndex]; | |
} | |
void test() | |
{ | |
//loop over the markers | |
for(int i=0;i< this->count_markers;i+=7) | |
{ | |
//get the i-th marker | |
const MarkerPtr snp=getMarker(i); | |
//loop over the individual | |
for(int j=0;j< (int)individuals.size();j+=10) | |
{ | |
const IndividualPtr indi =individuals[j]; | |
//get the genotype for this(individual/snp) | |
const AllelesPtr var= get(indi,snp); | |
cout << snp->name << " for " << indi->name << " is (" << var->alleles[0]<<"/"<< var->alleles[1] <<")" << endl; | |
} | |
} | |
} | |
}; | |
int main(int argc,char **argv) | |
{ | |
Linkage linkage; | |
char* filename=NULL; | |
char* program=NULL; | |
int optind=1; | |
while(optind<argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
std::cerr << "Usage: "<< argv[0] << "[options]" << endl | |
<<" -f <filename>" << endl | |
<<" -p <program= read|write>" << endl | |
; | |
return EXIT_SUCCESS; | |
} | |
else if(strcmp(argv[optind],"-f")==0 && optind+1<argc) | |
{ | |
filename=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"-p")==0 && optind+1<argc) | |
{ | |
program=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
++optind; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
std::cerr << "unknown option "<< argv[optind] << endl; | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(optind!=argc) | |
{ | |
std::cerr << "illegal number of args"<< endl; | |
return EXIT_FAILURE; | |
} | |
if(filename==NULL) | |
{ | |
std::cerr << "filename not defined"<< endl; | |
return EXIT_FAILURE; | |
} | |
if(program==NULL) | |
{ | |
std::cerr << "program not defined"<< endl; | |
return EXIT_FAILURE; | |
} | |
else if(strcmp(program,"write")==0) | |
{ | |
linkage.open(filename,false); | |
linkage.random(); | |
} | |
else if(strcmp(program,"read")==0) | |
{ | |
linkage.open(filename,true); | |
linkage.test(); | |
} | |
else | |
{ | |
std::cerr << "unknown program"; | |
return EXIT_FAILURE; | |
} | |
return EXIT_SUCCESS; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all: | |
g++ -Wall binlinkage.cpp | |
./a.out -f linkage.bin -p write | |
./a.out -f linkage.bin -p read | |
rm linkage.bin |
That's it.
Pierre
2 comments:
While working on some rather large SNP datasets at DeCODE I needed a file format for it. One that was efficient to access directly on the disk, since I couldn't load all of it into RAM. The first attempt was very similar to yours, but I needed to extend it again and again with meta data about the individuals and the markers, so a student and I extended it with general meta data that can be stored directly in the files so we didn't have to worry about having the main marker data and the meta data go out of sync.
We had a number of small programs to manipulate these files, but that became too tedious an approach. We had to write a new program for each new kind of meta data. So we wrote a Python interface to the file format instead. That meant we had to add type information to the meta data which was a bit difficult to get right, but in the end we had a pretty flexible data format.
You can see the code here: http://bircwww.daimi.au.dk/~mailund/SNPfile/index.html
Thomas, I *would not* store some genotypes using the binary format I've described in this post: As you said, It would be too difficult to insert some (sorted) new data, to add some metadata , etc... That's why I would rather use something like BerkeleyDB to manage my genotypes.
Post a Comment