Short post: Plain text vs Binary data.
This week, I was asked the difference between storing some data using a plain text format and a binary format. I wrote the following code C++ to illustrate how to store some genotypes in a ( possibly huge )table saved as a binary file. The first bytes of this file tells the number of individuals, the number of markers, the name of the individuals. Then for each marker, we get the name of the marker, its position and an array of genotypes for each individual.
The first invocation of this program (-p write ) creates a random table and a second call (-p read ) answers the genotypes for some random individuals/markers.
/** | |
naive program showing how to manage a (large) linkage file with a binary file | |
Pierre Lindenbaum | |
*/ | |
#include <cstdio> | |
#include <cstdlib> | |
#include <cstring> | |
#include <vector> | |
#include <iostream> | |
using namespace std; | |
#define MAX_NAME 50 | |
#define MAX_INDIVIDUAL_NAME MAX_NAME | |
#define MAX_SNP_NAME MAX_NAME | |
/** gender */ | |
enum gender | |
{ | |
unknown=0, | |
male=1, | |
female=2 | |
}; | |
/** A SNP mapped on the genome */ | |
typedef struct marker_t | |
{ | |
/** rs Name */ | |
char name[MAX_SNP_NAME]; | |
/** Y index in the table */ | |
int rowIndex; | |
/** chromosome identifier */ | |
int chromosomeId; | |
/** position in the chromosome */ | |
int position; | |
}Marker,*MarkerPtr; | |
/** A Individual genotyped */ | |
typedef struct individual_t | |
{ | |
/** X column index in the table */ | |
int columnIndex; | |
/** his name */ | |
char name[MAX_INDIVIDUAL_NAME]; | |
/** index of the father of -1 if unknown */ | |
int fatherIndex; | |
/** index of the mother of -1 if unknown */ | |
int motherIndex; | |
/** George Michael wants it */ | |
gender sex; | |
}Individual,*IndividualPtr; | |
/** Alleles , just two bases */ | |
typedef struct alleles_t | |
{ | |
char alleles[2]; | |
}Alleles,*AllelesPtr; | |
/** | |
a class handling a 'linkage' binary file | |
X axis are the individuals (in memory) | |
Y axis are the SNP | |
values are the Marker | |
both Marker and Alleles are accessed using a random access | |
structure of the file | |
1 int = count(individuals) | |
1 int = count(markers) | |
count(individuals)*Individuals = Individuals | |
count(markers)*( Marker + alleles[count(individuals)]) = markers+genotypes | |
*/ | |
class Linkage | |
{ | |
private: | |
/** FILE pointer to the binary file */ | |
FILE* io_ptr; | |
/** All individuals in the pedigree */ | |
vector<IndividualPtr> individuals; | |
/** number of markers in the binary file */ | |
int count_markers; | |
/** last SNP fetched ,in memory, avoid to call fseek each time for the same marker */ | |
MarkerPtr current_snp; | |
/** last row of genotypes fetched, used in memory, avoid to call fseek each time for the same marker */ | |
AllelesPtr current_alleles; | |
private: | |
/** fetch the rowIndex-th SNP and the rowIndex-th array of Alleles */ | |
void _fetchRow(int rowIndex) | |
{ | |
//current SNP is already in memory | |
if(current_snp->rowIndex==rowIndex) return; | |
//change the file position | |
fseek( io_ptr, (2*sizeof(int) + //nbr individual + nbr markers | |
sizeof(Individual)*individuals.size()+ //individual names | |
(sizeof(Alleles)*individuals.size()+sizeof(Marker))*rowIndex //previous rows (Marker+alleles[]) | |
), | |
SEEK_SET | |
); | |
//read the row: | |
fread(current_snp,sizeof(Marker),1,io_ptr);//read the current_snp | |
fread(current_alleles,sizeof(Alleles),individuals.size(),io_ptr);//read the array of alleles | |
} | |
public: | |
Linkage():io_ptr(NULL),count_markers(-1),current_snp(NULL),current_alleles(NULL) | |
{ | |
} | |
~Linkage() | |
{ | |
close(); | |
} | |
void open(const char* filename,bool read_only) | |
{ | |
//close if it was already open | |
close(); | |
//open as binary | |
io_ptr=fopen(filename,(read_only?"rb":"wb")); | |
if(io_ptr==NULL) throw "cannot open file"; | |
if(read_only) | |
{ | |
int n_individuals=0; | |
//read the number of individuals | |
fread(&n_individuals,sizeof(int),1,io_ptr); | |
//alloc the buffer for current_snp | |
current_snp=new Marker; | |
current_snp->rowIndex=-1; | |
//alloc the buffer for current_allele | |
current_alleles=new Alleles[n_individuals]; | |
//read the number of SNP | |
fread(&count_markers,sizeof(int),1,io_ptr); | |
//read the definition of the individuals | |
for(int i=0;i< n_individuals;++i) | |
{ | |
IndividualPtr indi=new Individual; | |
fread(indi,sizeof(Individual),1,io_ptr); | |
//add it into the pedigree | |
individuals.push_back(indi); | |
} | |
} | |
} | |
void close() | |
{ | |
//clean individuals | |
while(!individuals.empty()) | |
{ | |
delete individuals.back(); | |
individuals.pop_back(); | |
} | |
//close file | |
if(io_ptr!=NULL) fclose(io_ptr); | |
io_ptr=NULL; | |
//clean current_snp and current_alleles | |
if(current_snp!=NULL) delete current_snp; | |
current_snp=NULL; | |
if(current_alleles!=NULL) delete [] current_alleles; | |
current_alleles=NULL; | |
} | |
//create a random binary file | |
void random() | |
{ | |
//create a pedigree | |
while(individuals.size()<100) | |
{ | |
//create a new individual | |
IndividualPtr newindi=new Individual; | |
newindi->columnIndex=individuals.size(); | |
newindi->fatherIndex=-1; | |
newindi->motherIndex=-1; | |
snprintf(newindi->name,MAX_INDIVIDUAL_NAME,"IndiId.%03d",individuals.size()); | |
//add this new individual | |
individuals.push_back(newindi); | |
} | |
int n=individuals.size(); | |
int n_markers=100000; | |
//save the number of individuals | |
fwrite(&n,sizeof(int),1,io_ptr); | |
//save the number of snps | |
fwrite(&n_markers,sizeof(int),1,io_ptr); | |
//write the individuals | |
for(int i=0; i< (int)individuals.size();++i) | |
{ | |
fwrite(individuals[i],sizeof(Individual),1,io_ptr); | |
} | |
//write all the SNPs | |
for(int i=0;i< n_markers;++i) | |
{ | |
Marker snp; | |
snprintf(snp.name,MAX_SNP_NAME,"rs%d",(i+1)); | |
snp.chromosomeId=12; | |
snp.position=i*2; | |
snp.rowIndex=i; | |
//write this snp | |
fwrite(&snp,sizeof(Marker),1,io_ptr); | |
//for each SNP , we add an array of Alleles[individuals.size()] | |
for(int j=0;j< (int)individuals.size();++j) | |
{ | |
Alleles a; | |
a.alleles[0]=(j%3==0?'A':(i%2==0?'A':'T')); | |
a.alleles[1]=(i%13==0?'C':'T');; | |
fwrite(&a,sizeof(Alleles),1,io_ptr); | |
} | |
} | |
//flush | |
fflush(io_ptr); | |
} | |
//get the marker at a given Y index | |
const MarkerPtr getMarker(int marker_index) | |
{ | |
_fetchRow(marker_index); | |
return current_snp; | |
} | |
//get the genotype for the given individual and the given snp | |
const AllelesPtr get(const IndividualPtr indi,const MarkerPtr snp) | |
{ | |
_fetchRow(snp->rowIndex); | |
return ¤t_alleles[indi->columnIndex]; | |
} | |
void test() | |
{ | |
//loop over the markers | |
for(int i=0;i< this->count_markers;i+=7) | |
{ | |
//get the i-th marker | |
const MarkerPtr snp=getMarker(i); | |
//loop over the individual | |
for(int j=0;j< (int)individuals.size();j+=10) | |
{ | |
const IndividualPtr indi =individuals[j]; | |
//get the genotype for this(individual/snp) | |
const AllelesPtr var= get(indi,snp); | |
cout << snp->name << " for " << indi->name << " is (" << var->alleles[0]<<"/"<< var->alleles[1] <<")" << endl; | |
} | |
} | |
} | |
}; | |
int main(int argc,char **argv) | |
{ | |
Linkage linkage; | |
char* filename=NULL; | |
char* program=NULL; | |
int optind=1; | |
while(optind<argc) | |
{ | |
if(strcmp(argv[optind],"-h")==0) | |
{ | |
std::cerr << "Usage: "<< argv[0] << "[options]" << endl | |
<<" -f <filename>" << endl | |
<<" -p <program= read|write>" << endl | |
; | |
return EXIT_SUCCESS; | |
} | |
else if(strcmp(argv[optind],"-f")==0 && optind+1<argc) | |
{ | |
filename=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"-p")==0 && optind+1<argc) | |
{ | |
program=argv[++optind]; | |
} | |
else if(strcmp(argv[optind],"--")==0) | |
{ | |
++optind; | |
break; | |
} | |
else if(argv[optind][0]=='-') | |
{ | |
std::cerr << "unknown option "<< argv[optind] << endl; | |
return EXIT_FAILURE; | |
} | |
else | |
{ | |
break; | |
} | |
++optind; | |
} | |
if(optind!=argc) | |
{ | |
std::cerr << "illegal number of args"<< endl; | |
return EXIT_FAILURE; | |
} | |
if(filename==NULL) | |
{ | |
std::cerr << "filename not defined"<< endl; | |
return EXIT_FAILURE; | |
} | |
if(program==NULL) | |
{ | |
std::cerr << "program not defined"<< endl; | |
return EXIT_FAILURE; | |
} | |
else if(strcmp(program,"write")==0) | |
{ | |
linkage.open(filename,false); | |
linkage.random(); | |
} | |
else if(strcmp(program,"read")==0) | |
{ | |
linkage.open(filename,true); | |
linkage.test(); | |
} | |
else | |
{ | |
std::cerr << "unknown program"; | |
return EXIT_FAILURE; | |
} | |
return EXIT_SUCCESS; | |
} |
all: | |
g++ -Wall binlinkage.cpp | |
./a.out -f linkage.bin -p write | |
./a.out -f linkage.bin -p read | |
rm linkage.bin |
That's it.
Pierre