09 July 2011

Storing SNPs in a HDF5 file: my notebook

I discovered the benefits of using twitter in 2008, the day Deepak Singh replied to one of my tweets related to the storage of a large number of genotypes.





Since that day, I've tried to use the HDF5 library, without any success (there's a large disheartening documentation/API on the HDF5 site and the API seems to be only used by a small number of geeks). Furthermore, HDF5 is a technology used by the IGV genome browser.

In that post, I'll describe a C program loading a set of SNPs defined by the following C structure:
typedef struct structSnp
{
char rsId[RS_LENGTH];//rs##
char chrom[CHROM_LENGTH];//chromosome name
int chromStart;//genomic start index
int chromEnd;//genomic end index
}Snp;
The SNPs will be read from stdin. Four columns are expected: rs-id, chrom, chromStart and chromEnd. Here is the command line I used to get a small input file:
curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\
gunzip -c |\
cut -d ' ' -f 2-5 |\
head -n 10000 > sample.tsv

Source code


The C program is described below. I hope my comments will make the code readable.
/**
* Author:
* Pierre Lindenbaum PhD
* Date:
* 2011-07-08
* Contact:
* plindenbaum@yahoo.fr
* WWW:
* http://plindenbaum.blogspot.com
* Reference:
* http://plindenbaum.blogspot.com/2011/07/storing-snps-in-hdf5-file-my-notebook.html
* Motivation:
* learning HDF5
* inserting some SNPs in a HDF5 file
* History:
* 2011-07-10 add a string attribute
* 2011-07-09 1st version
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <hdf5.h>
#define FATAL do { fprintf(stderr,"%s:%d failure.\n",__FILE__,__LINE__); exit(EXIT_FAILURE); } while(0)
#define ASSERT_POSITIVE(a) if(a<0) FATAL
#define RS_LENGTH 13
#define CHROM_LENGTH 7
//http://stackoverflow.com/questions/5459868
#define STR_HELPER(x) #x
#define CASTSTR(x) STR_HELPER(x)
/** structure holding a SNP */
typedef struct structSnp
{
char rsId[RS_LENGTH];//rs##
char chrom[CHROM_LENGTH];//chromosome name
int chromStart;//genomic start index
int chromEnd;//genomic end index
}Snp;
int main(int argc,char** argv)
{
char line[BUFSIZ];
hid_t file_id;
hid_t group_id;
hid_t snp_tid;
hid_t rs_type_id;
hid_t chrom_type_id;
hid_t dataset_id;
hid_t dataspace_id;
hid_t memspace_id;
hid_t props;
hid_t attribute_id;
hid_t attribute_dataspace_id;
hid_t attribute_type_id;
hsize_t dims[1];
hsize_t maxdims[1];
hsize_t chunk_size[1];
hsize_t extend[1];
hsize_t oneElement[]={1};
herr_t err;
char attribute_str[]="My list of SNPs";
int countSnp=0;
/* create datatype for rs## name */
rs_type_id = H5Tcopy(H5T_C_S1);
ASSERT_POSITIVE(rs_type_id);
err = H5Tset_size(rs_type_id,RS_LENGTH);
ASSERT_POSITIVE(err);
/* create datatype for chromosome name */
chrom_type_id = H5Tcopy(H5T_C_S1);
ASSERT_POSITIVE(chrom_type_id);
err = H5Tset_size(chrom_type_id,CHROM_LENGTH);
ASSERT_POSITIVE(err);
/* create SNP type ID */
snp_tid = H5Tcreate (H5T_COMPOUND, sizeof(Snp));
ASSERT_POSITIVE(snp_tid);
err=H5Tinsert(snp_tid, "rs",HOFFSET(Snp, rsId),rs_type_id);
ASSERT_POSITIVE(err);
err=H5Tinsert(snp_tid, "chrom",HOFFSET(Snp, chrom),chrom_type_id);
ASSERT_POSITIVE(err);
err=H5Tinsert(snp_tid, "chromStart",HOFFSET(Snp, chromStart), H5T_NATIVE_INT);
ASSERT_POSITIVE(err);
err=H5Tinsert(snp_tid, "chromEnd", HOFFSET(Snp, chromEnd), H5T_NATIVE_INT);
ASSERT_POSITIVE(err);
/* Create HDF5 files. */
file_id = H5Fcreate(
"storage.h5",
H5F_ACC_TRUNC,//flags
H5P_DEFAULT,//create_id
H5P_DEFAULT //access id
);
ASSERT_POSITIVE(file_id);
/* create group */
group_id= H5Gcreate2(
file_id,
"variations", //path
H5P_DEFAULT, //Property list for link creation
H5P_DEFAULT,// Property list for group creation
H5P_DEFAULT // Property list for group access
);
ASSERT_POSITIVE(group_id);
/* create dataspace */
dims[0]=0;
maxdims[0]=H5S_UNLIMITED;
dataspace_id = H5Screate_simple(
1,//Number of dimensions of dataspace
dims,//Array specifying the size of each dimension
maxdims //Array specifying the maximum size of each dimension
);
ASSERT_POSITIVE(dataspace_id);
/* create params for creating dataset */
props = H5Pcreate (H5P_DATASET_CREATE);
ASSERT_POSITIVE(props);
/* Sets the size of the chunks used to store a chunked layout dataset. */
chunk_size[0]=100;
err=H5Pset_chunk(props,1, chunk_size);
ASSERT_POSITIVE(err);
/* create dataset */
dataset_id = H5Dcreate2(
group_id,
"dbSNP",
snp_tid,//datatype
dataspace_id,//dataspace
H5P_DEFAULT,//Link creation property list
props,//Dataset creation property list
H5P_DEFAULT //Dataset access property list
);
ASSERT_POSITIVE(dataset_id);
/* create simple memory dataspace */
memspace_id=H5Screate_simple(
1,//num dimension
oneElement,// size of each dimension.
NULL);
ASSERT_POSITIVE(memspace_id);
/** read stdin. 4 columns expected
- chrom
- chromStart
- chromEnd
- rsId
*/
while(fgets(line,BUFSIZ,stdin)!=NULL)
{
hid_t filespace_id;
hsize_t offset[1];
Snp snp;
//read line content
if(sscanf (line, "%" CASTSTR(CHROM_LENGTH) "s %d %d %" CASTSTR(RS_LENGTH) "s",
snp.chrom,
&snp.chromStart,
&snp.chromEnd,
snp.rsId)!=4) //expect 4 fields
{
fprintf(stderr,"Error in line %s\n",line);
FATAL;
}
//check data
if(strncmp(snp.chrom,"chr",3)!=0 ||
strncmp(snp.rsId,"rs",2)!=0 ||
snp.chromStart<0 ||
snp.chromEnd <snp.chromStart
)
{
fprintf(stderr,"Error in line %s\n",line);
FATAL;
}
//increase the number of snp
++countSnp;
/* extend the dataset */
extend[0]=countSnp;
err = H5Dextend(dataset_id, extend);
ASSERT_POSITIVE(err);
// Selects a hyperslab region to add to the current selected region. one record.
offset[0]=countSnp-1;
//get current filespace
filespace_id = H5Dget_space(dataset_id);
ASSERT_POSITIVE(filespace_id);
//Selects a hyperslab region to add to the current selected region.
err = H5Sselect_hyperslab(
filespace_id,//Identifier of dataspace selection to modify
H5S_SELECT_SET,//replaces the existing selection with the parameters from this call.
offset,//Offset of start of hyperslab
NULL,//Number of blocks . NULL : one element
oneElement,//Hyperslab stride
NULL//Size of block in hyperslab. null=single element
);
ASSERT_POSITIVE(err);
/* Write the data to the hyperslab */
err = H5Dwrite (
dataset_id,//data set
snp_tid,//data type
memspace_id,//mem_space_id
filespace_id,//file_space_id
H5P_DEFAULT,//properties
(void*) &snp //data to write
);
ASSERT_POSITIVE(err);
err = H5Sclose (filespace_id);
ASSERT_POSITIVE(err);
}
/* add an attribute to the dataset */
attribute_dataspace_id = H5Screate(H5S_SCALAR);
ASSERT_POSITIVE(attribute_dataspace_id);
/* If the application wishes to store character data, then an HDF5 string datatype should be derived from H5T_C_S1 instead. */
attribute_type_id = H5Tcopy(H5T_C_S1);
ASSERT_POSITIVE(attribute_type_id);
/* is total size of the datum in bytes, including padding */
err=H5Tset_size(attribute_type_id, strlen(attribute_str));
ASSERT_POSITIVE(err);
/* Defines the storage mechanism for character strings. */
err=H5Tset_strpad(attribute_type_id,H5T_STR_NULLTERM);//Null terminate (as C does).
ASSERT_POSITIVE(err);
/** create attribute */
attribute_id= H5Acreate2(
dataset_id,
"rdfs:comment",
attribute_type_id,
attribute_dataspace_id,//Attribute dataspace identifier
H5P_DEFAULT,//Attribute creation property list identifier
H5P_DEFAULT//Attribute access property list identifier
);
ASSERT_POSITIVE(attribute_id);
err = H5Awrite(attribute_id, attribute_type_id, attribute_str);
ASSERT_POSITIVE(err);
/* close attribute */
err = H5Aclose(attribute_id);
ASSERT_POSITIVE(err);
/* close attribute type */
err = H5Tclose(attribute_type_id);
ASSERT_POSITIVE(err);
/* close attribute space */
err = H5Sclose(attribute_dataspace_id);
ASSERT_POSITIVE(err);
/* close dataset props */
err=H5Pclose(props);
ASSERT_POSITIVE(err);
/* close memspace */
err=H5Sclose(memspace_id);
ASSERT_POSITIVE(err);
/* close dataset */
err=H5Dclose(dataset_id);
ASSERT_POSITIVE(err);
/* close dataspace */
err=H5Sclose(dataspace_id);
ASSERT_POSITIVE(err);
/* close data type rs## */
err=H5Tclose(rs_type_id);
ASSERT_POSITIVE(err);
/* close data type chrom */
err=H5Tclose(chrom_type_id);
ASSERT_POSITIVE(err);
/* close group */
err=H5Gclose(group_id);
ASSERT_POSITIVE(err);
/* close file */
err=H5Fclose(file_id);
ASSERT_POSITIVE(err);
return EXIT_SUCCESS;
}
view raw dbsnp2hdf5.c hosted with ❤ by GitHub

The Makefile


HDF5HOME=${HOME}/package/hdf5
CFLAGS=-I ${HDF5HOME}/include
LDFLAGS=-L ${HDF5HOME}/lib
all:test
test:sample.tsv a.out
./a.out < sample.tsv
${HDF5HOME}/bin/h5dump storage.h5
a.out:dbsnp2hdf5.c
gcc -Wall -o $@ $(CFLAGS) ${LDFLAGS} dbsnp2hdf5.c -lhdf5
sample.tsv:
curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\
gunzip -c |\
cut -d ' ' -f 2-5 |\
head -n 10000 > $@
view raw Makefile hosted with ❤ by GitHub

Compile and run

gcc -Wall -o a.out -I  /path/to/hdf5/include -L /path/to/hdf5/lib dbsnp2hdf5.c  -lhdf5
./a.out < sample.tsv

Dump the data


At the en, the program creates a structured binary file containing our SNPs. The HDF5 utility h5dump can be used to display the data as text:
$ h5dump storage.h5

HDF5 "storage.h5" {
GROUP "/" {
GROUP "variations" {
DATASET "dbSNP" {
DATATYPE H5T_COMPOUND {
H5T_STRING {
STRSIZE 13;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
} "rs";
H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
} "chrom";
H5T_STD_I32LE "chromStart";
H5T_STD_I32LE "chromEnd";
}
DATASPACE SIMPLE { ( 10000 ) / ( H5S_UNLIMITED ) }
DATA {
(0): {
"rs112750067",
"chr1",
10326,
10327
},
(1): {
"rs56289060",
"chr1",
10433,
10433
},
(2): {
"rs112155239",
"chr1",
10439,
10440
},
(3): {
"rs112766696",
"chr1",
10439,
10440
},
(...)
450425,
450426
}
}
ATTRIBUTE "rdfs:comment" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
DATA {
(0): "My list of SNPs"
}
}
}
}
}
}


That's it !
Pierre

1 comment:

Anonymous said...

Well and then there's GWASpi, which loads genotypes in a HDF compatible netCDF file.
It seems to be the same type of storage as you can open up netCDf files with HDFView.

For sequences (reads and alignments) there's also the BioHDF libraries which do all the storage and I/O work for you.