23 April 2010

Short post: Plain text vs Binary data.

This week, I was asked the difference between storing some data using a plain text format and a binary format. I wrote the following code C++ to illustrate how to store some genotypes in a ( possibly huge )table saved as a binary file. The first bytes of this file tells the number of individuals, the number of markers, the name of the individuals. Then for each marker, we get the name of the marker, its position and an array of genotypes for each individual.

The first invocation of this program (-p write ) creates a random table and a second call (-p read ) answers the genotypes for some random individuals/markers.

/**
naive program showing how to manage a (large) linkage file with a binary file
Pierre Lindenbaum
*/
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <vector>
#include <iostream>
using namespace std;
#define MAX_NAME 50
#define MAX_INDIVIDUAL_NAME MAX_NAME
#define MAX_SNP_NAME MAX_NAME
/** gender */
enum gender
{
unknown=0,
male=1,
female=2
};
/** A SNP mapped on the genome */
typedef struct marker_t
{
/** rs Name */
char name[MAX_SNP_NAME];
/** Y index in the table */
int rowIndex;
/** chromosome identifier */
int chromosomeId;
/** position in the chromosome */
int position;
}Marker,*MarkerPtr;
/** A Individual genotyped */
typedef struct individual_t
{
/** X column index in the table */
int columnIndex;
/** his name */
char name[MAX_INDIVIDUAL_NAME];
/** index of the father of -1 if unknown */
int fatherIndex;
/** index of the mother of -1 if unknown */
int motherIndex;
/** George Michael wants it */
gender sex;
}Individual,*IndividualPtr;
/** Alleles , just two bases */
typedef struct alleles_t
{
char alleles[2];
}Alleles,*AllelesPtr;
/**
a class handling a 'linkage' binary file
X axis are the individuals (in memory)
Y axis are the SNP
values are the Marker
both Marker and Alleles are accessed using a random access
structure of the file
1 int = count(individuals)
1 int = count(markers)
count(individuals)*Individuals = Individuals
count(markers)*( Marker + alleles[count(individuals)]) = markers+genotypes
*/
class Linkage
{
private:
/** FILE pointer to the binary file */
FILE* io_ptr;
/** All individuals in the pedigree */
vector<IndividualPtr> individuals;
/** number of markers in the binary file */
int count_markers;
/** last SNP fetched ,in memory, avoid to call fseek each time for the same marker */
MarkerPtr current_snp;
/** last row of genotypes fetched, used in memory, avoid to call fseek each time for the same marker */
AllelesPtr current_alleles;
private:
/** fetch the rowIndex-th SNP and the rowIndex-th array of Alleles */
void _fetchRow(int rowIndex)
{
//current SNP is already in memory
if(current_snp->rowIndex==rowIndex) return;
//change the file position
fseek( io_ptr, (2*sizeof(int) + //nbr individual + nbr markers
sizeof(Individual)*individuals.size()+ //individual names
(sizeof(Alleles)*individuals.size()+sizeof(Marker))*rowIndex //previous rows (Marker+alleles[])
),
SEEK_SET
);
//read the row:
fread(current_snp,sizeof(Marker),1,io_ptr);//read the current_snp
fread(current_alleles,sizeof(Alleles),individuals.size(),io_ptr);//read the array of alleles
}
public:
Linkage():io_ptr(NULL),count_markers(-1),current_snp(NULL),current_alleles(NULL)
{
}
~Linkage()
{
close();
}
void open(const char* filename,bool read_only)
{
//close if it was already open
close();
//open as binary
io_ptr=fopen(filename,(read_only?"rb":"wb"));
if(io_ptr==NULL) throw "cannot open file";
if(read_only)
{
int n_individuals=0;
//read the number of individuals
fread(&n_individuals,sizeof(int),1,io_ptr);
//alloc the buffer for current_snp
current_snp=new Marker;
current_snp->rowIndex=-1;
//alloc the buffer for current_allele
current_alleles=new Alleles[n_individuals];
//read the number of SNP
fread(&count_markers,sizeof(int),1,io_ptr);
//read the definition of the individuals
for(int i=0;i< n_individuals;++i)
{
IndividualPtr indi=new Individual;
fread(indi,sizeof(Individual),1,io_ptr);
//add it into the pedigree
individuals.push_back(indi);
}
}
}
void close()
{
//clean individuals
while(!individuals.empty())
{
delete individuals.back();
individuals.pop_back();
}
//close file
if(io_ptr!=NULL) fclose(io_ptr);
io_ptr=NULL;
//clean current_snp and current_alleles
if(current_snp!=NULL) delete current_snp;
current_snp=NULL;
if(current_alleles!=NULL) delete [] current_alleles;
current_alleles=NULL;
}
//create a random binary file
void random()
{
//create a pedigree
while(individuals.size()<100)
{
//create a new individual
IndividualPtr newindi=new Individual;
newindi->columnIndex=individuals.size();
newindi->fatherIndex=-1;
newindi->motherIndex=-1;
snprintf(newindi->name,MAX_INDIVIDUAL_NAME,"IndiId.%03d",individuals.size());
//add this new individual
individuals.push_back(newindi);
}
int n=individuals.size();
int n_markers=100000;
//save the number of individuals
fwrite(&n,sizeof(int),1,io_ptr);
//save the number of snps
fwrite(&n_markers,sizeof(int),1,io_ptr);
//write the individuals
for(int i=0; i< (int)individuals.size();++i)
{
fwrite(individuals[i],sizeof(Individual),1,io_ptr);
}
//write all the SNPs
for(int i=0;i< n_markers;++i)
{
Marker snp;
snprintf(snp.name,MAX_SNP_NAME,"rs%d",(i+1));
snp.chromosomeId=12;
snp.position=i*2;
snp.rowIndex=i;
//write this snp
fwrite(&snp,sizeof(Marker),1,io_ptr);
//for each SNP , we add an array of Alleles[individuals.size()]
for(int j=0;j< (int)individuals.size();++j)
{
Alleles a;
a.alleles[0]=(j%3==0?'A':(i%2==0?'A':'T'));
a.alleles[1]=(i%13==0?'C':'T');;
fwrite(&a,sizeof(Alleles),1,io_ptr);
}
}
//flush
fflush(io_ptr);
}
//get the marker at a given Y index
const MarkerPtr getMarker(int marker_index)
{
_fetchRow(marker_index);
return current_snp;
}
//get the genotype for the given individual and the given snp
const AllelesPtr get(const IndividualPtr indi,const MarkerPtr snp)
{
_fetchRow(snp->rowIndex);
return &current_alleles[indi->columnIndex];
}
void test()
{
//loop over the markers
for(int i=0;i< this->count_markers;i+=7)
{
//get the i-th marker
const MarkerPtr snp=getMarker(i);
//loop over the individual
for(int j=0;j< (int)individuals.size();j+=10)
{
const IndividualPtr indi =individuals[j];
//get the genotype for this(individual/snp)
const AllelesPtr var= get(indi,snp);
cout << snp->name << " for " << indi->name << " is (" << var->alleles[0]<<"/"<< var->alleles[1] <<")" << endl;
}
}
}
};
int main(int argc,char **argv)
{
Linkage linkage;
char* filename=NULL;
char* program=NULL;
int optind=1;
while(optind<argc)
{
if(strcmp(argv[optind],"-h")==0)
{
std::cerr << "Usage: "<< argv[0] << "[options]" << endl
<<" -f <filename>" << endl
<<" -p <program= read|write>" << endl
;
return EXIT_SUCCESS;
}
else if(strcmp(argv[optind],"-f")==0 && optind+1<argc)
{
filename=argv[++optind];
}
else if(strcmp(argv[optind],"-p")==0 && optind+1<argc)
{
program=argv[++optind];
}
else if(strcmp(argv[optind],"--")==0)
{
++optind;
break;
}
else if(argv[optind][0]=='-')
{
std::cerr << "unknown option "<< argv[optind] << endl;
return EXIT_FAILURE;
}
else
{
break;
}
++optind;
}
if(optind!=argc)
{
std::cerr << "illegal number of args"<< endl;
return EXIT_FAILURE;
}
if(filename==NULL)
{
std::cerr << "filename not defined"<< endl;
return EXIT_FAILURE;
}
if(program==NULL)
{
std::cerr << "program not defined"<< endl;
return EXIT_FAILURE;
}
else if(strcmp(program,"write")==0)
{
linkage.open(filename,false);
linkage.random();
}
else if(strcmp(program,"read")==0)
{
linkage.open(filename,true);
linkage.test();
}
else
{
std::cerr << "unknown program";
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
view raw binlinkage.cpp hosted with ❤ by GitHub
all:
g++ -Wall binlinkage.cpp
./a.out -f linkage.bin -p write
./a.out -f linkage.bin -p read
rm linkage.bin
view raw Makefile hosted with ❤ by GitHub


That's it.

Pierre

2 comments:

Mailund said...

While working on some rather large SNP datasets at DeCODE I needed a file format for it. One that was efficient to access directly on the disk, since I couldn't load all of it into RAM. The first attempt was very similar to yours, but I needed to extend it again and again with meta data about the individuals and the markers, so a student and I extended it with general meta data that can be stored directly in the files so we didn't have to worry about having the main marker data and the meta data go out of sync.

We had a number of small programs to manipulate these files, but that became too tedious an approach. We had to write a new program for each new kind of meta data. So we wrote a Python interface to the file format instead. That meant we had to add type information to the meta data which was a bit difficult to get right, but in the end we had a pretty flexible data format.

You can see the code here: http://bircwww.daimi.au.dk/~mailund/SNPfile/index.html

Pierre Lindenbaum said...

Thomas, I *would not* store some genotypes using the binary format I've described in this post: As you said, It would be too difficult to insert some (sorted) new data, to add some metadata , etc... That's why I would rather use something like BerkeleyDB to manage my genotypes.