Storing SNPs in a HDF5 file: my notebook
I discovered the benefits of using twitter in 2008, the day Deepak Singh replied to one of my tweets related to the storage of a large number of genotypes.
Since that day, I've tried to use the HDF5 library, without any success (there's a large disheartening documentation/API on the HDF5 site and the API seems to be only used by a small number of geeks). Furthermore, HDF5 is a technology used by the IGV genome browser.
In that post, I'll describe a C program loading a set of SNPs defined by the following C structure:
typedef struct structSnpThe SNPs will be read from stdin. Four columns are expected: rs-id, chrom, chromStart and chromEnd. Here is the command line I used to get a small input file:
{
char rsId[RS_LENGTH];//rs##
char chrom[CHROM_LENGTH];//chromosome name
int chromStart;//genomic start index
int chromEnd;//genomic end index
}Snp;
curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\
gunzip -c |\
cut -d ' ' -f 2-5 |\
head -n 10000 > sample.tsv
Source code
The C program is described below. I hope my comments will make the code readable.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Author: | |
* Pierre Lindenbaum PhD | |
* Date: | |
* 2011-07-08 | |
* Contact: | |
* plindenbaum@yahoo.fr | |
* WWW: | |
* http://plindenbaum.blogspot.com | |
* Reference: | |
* http://plindenbaum.blogspot.com/2011/07/storing-snps-in-hdf5-file-my-notebook.html | |
* Motivation: | |
* learning HDF5 | |
* inserting some SNPs in a HDF5 file | |
* History: | |
* 2011-07-10 add a string attribute | |
* 2011-07-09 1st version | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <hdf5.h> | |
#define FATAL do { fprintf(stderr,"%s:%d failure.\n",__FILE__,__LINE__); exit(EXIT_FAILURE); } while(0) | |
#define ASSERT_POSITIVE(a) if(a<0) FATAL | |
#define RS_LENGTH 13 | |
#define CHROM_LENGTH 7 | |
//http://stackoverflow.com/questions/5459868 | |
#define STR_HELPER(x) #x | |
#define CASTSTR(x) STR_HELPER(x) | |
/** structure holding a SNP */ | |
typedef struct structSnp | |
{ | |
char rsId[RS_LENGTH];//rs## | |
char chrom[CHROM_LENGTH];//chromosome name | |
int chromStart;//genomic start index | |
int chromEnd;//genomic end index | |
}Snp; | |
int main(int argc,char** argv) | |
{ | |
char line[BUFSIZ]; | |
hid_t file_id; | |
hid_t group_id; | |
hid_t snp_tid; | |
hid_t rs_type_id; | |
hid_t chrom_type_id; | |
hid_t dataset_id; | |
hid_t dataspace_id; | |
hid_t memspace_id; | |
hid_t props; | |
hid_t attribute_id; | |
hid_t attribute_dataspace_id; | |
hid_t attribute_type_id; | |
hsize_t dims[1]; | |
hsize_t maxdims[1]; | |
hsize_t chunk_size[1]; | |
hsize_t extend[1]; | |
hsize_t oneElement[]={1}; | |
herr_t err; | |
char attribute_str[]="My list of SNPs"; | |
int countSnp=0; | |
/* create datatype for rs## name */ | |
rs_type_id = H5Tcopy(H5T_C_S1); | |
ASSERT_POSITIVE(rs_type_id); | |
err = H5Tset_size(rs_type_id,RS_LENGTH); | |
ASSERT_POSITIVE(err); | |
/* create datatype for chromosome name */ | |
chrom_type_id = H5Tcopy(H5T_C_S1); | |
ASSERT_POSITIVE(chrom_type_id); | |
err = H5Tset_size(chrom_type_id,CHROM_LENGTH); | |
ASSERT_POSITIVE(err); | |
/* create SNP type ID */ | |
snp_tid = H5Tcreate (H5T_COMPOUND, sizeof(Snp)); | |
ASSERT_POSITIVE(snp_tid); | |
err=H5Tinsert(snp_tid, "rs",HOFFSET(Snp, rsId),rs_type_id); | |
ASSERT_POSITIVE(err); | |
err=H5Tinsert(snp_tid, "chrom",HOFFSET(Snp, chrom),chrom_type_id); | |
ASSERT_POSITIVE(err); | |
err=H5Tinsert(snp_tid, "chromStart",HOFFSET(Snp, chromStart), H5T_NATIVE_INT); | |
ASSERT_POSITIVE(err); | |
err=H5Tinsert(snp_tid, "chromEnd", HOFFSET(Snp, chromEnd), H5T_NATIVE_INT); | |
ASSERT_POSITIVE(err); | |
/* Create HDF5 files. */ | |
file_id = H5Fcreate( | |
"storage.h5", | |
H5F_ACC_TRUNC,//flags | |
H5P_DEFAULT,//create_id | |
H5P_DEFAULT //access id | |
); | |
ASSERT_POSITIVE(file_id); | |
/* create group */ | |
group_id= H5Gcreate2( | |
file_id, | |
"variations", //path | |
H5P_DEFAULT, //Property list for link creation | |
H5P_DEFAULT,// Property list for group creation | |
H5P_DEFAULT // Property list for group access | |
); | |
ASSERT_POSITIVE(group_id); | |
/* create dataspace */ | |
dims[0]=0; | |
maxdims[0]=H5S_UNLIMITED; | |
dataspace_id = H5Screate_simple( | |
1,//Number of dimensions of dataspace | |
dims,//Array specifying the size of each dimension | |
maxdims //Array specifying the maximum size of each dimension | |
); | |
ASSERT_POSITIVE(dataspace_id); | |
/* create params for creating dataset */ | |
props = H5Pcreate (H5P_DATASET_CREATE); | |
ASSERT_POSITIVE(props); | |
/* Sets the size of the chunks used to store a chunked layout dataset. */ | |
chunk_size[0]=100; | |
err=H5Pset_chunk(props,1, chunk_size); | |
ASSERT_POSITIVE(err); | |
/* create dataset */ | |
dataset_id = H5Dcreate2( | |
group_id, | |
"dbSNP", | |
snp_tid,//datatype | |
dataspace_id,//dataspace | |
H5P_DEFAULT,//Link creation property list | |
props,//Dataset creation property list | |
H5P_DEFAULT //Dataset access property list | |
); | |
ASSERT_POSITIVE(dataset_id); | |
/* create simple memory dataspace */ | |
memspace_id=H5Screate_simple( | |
1,//num dimension | |
oneElement,// size of each dimension. | |
NULL); | |
ASSERT_POSITIVE(memspace_id); | |
/** read stdin. 4 columns expected | |
- chrom | |
- chromStart | |
- chromEnd | |
- rsId | |
*/ | |
while(fgets(line,BUFSIZ,stdin)!=NULL) | |
{ | |
hid_t filespace_id; | |
hsize_t offset[1]; | |
Snp snp; | |
//read line content | |
if(sscanf (line, "%" CASTSTR(CHROM_LENGTH) "s %d %d %" CASTSTR(RS_LENGTH) "s", | |
snp.chrom, | |
&snp.chromStart, | |
&snp.chromEnd, | |
snp.rsId)!=4) //expect 4 fields | |
{ | |
fprintf(stderr,"Error in line %s\n",line); | |
FATAL; | |
} | |
//check data | |
if(strncmp(snp.chrom,"chr",3)!=0 || | |
strncmp(snp.rsId,"rs",2)!=0 || | |
snp.chromStart<0 || | |
snp.chromEnd <snp.chromStart | |
) | |
{ | |
fprintf(stderr,"Error in line %s\n",line); | |
FATAL; | |
} | |
//increase the number of snp | |
++countSnp; | |
/* extend the dataset */ | |
extend[0]=countSnp; | |
err = H5Dextend(dataset_id, extend); | |
ASSERT_POSITIVE(err); | |
// Selects a hyperslab region to add to the current selected region. one record. | |
offset[0]=countSnp-1; | |
//get current filespace | |
filespace_id = H5Dget_space(dataset_id); | |
ASSERT_POSITIVE(filespace_id); | |
//Selects a hyperslab region to add to the current selected region. | |
err = H5Sselect_hyperslab( | |
filespace_id,//Identifier of dataspace selection to modify | |
H5S_SELECT_SET,//replaces the existing selection with the parameters from this call. | |
offset,//Offset of start of hyperslab | |
NULL,//Number of blocks . NULL : one element | |
oneElement,//Hyperslab stride | |
NULL//Size of block in hyperslab. null=single element | |
); | |
ASSERT_POSITIVE(err); | |
/* Write the data to the hyperslab */ | |
err = H5Dwrite ( | |
dataset_id,//data set | |
snp_tid,//data type | |
memspace_id,//mem_space_id | |
filespace_id,//file_space_id | |
H5P_DEFAULT,//properties | |
(void*) &snp //data to write | |
); | |
ASSERT_POSITIVE(err); | |
err = H5Sclose (filespace_id); | |
ASSERT_POSITIVE(err); | |
} | |
/* add an attribute to the dataset */ | |
attribute_dataspace_id = H5Screate(H5S_SCALAR); | |
ASSERT_POSITIVE(attribute_dataspace_id); | |
/* If the application wishes to store character data, then an HDF5 string datatype should be derived from H5T_C_S1 instead. */ | |
attribute_type_id = H5Tcopy(H5T_C_S1); | |
ASSERT_POSITIVE(attribute_type_id); | |
/* is total size of the datum in bytes, including padding */ | |
err=H5Tset_size(attribute_type_id, strlen(attribute_str)); | |
ASSERT_POSITIVE(err); | |
/* Defines the storage mechanism for character strings. */ | |
err=H5Tset_strpad(attribute_type_id,H5T_STR_NULLTERM);//Null terminate (as C does). | |
ASSERT_POSITIVE(err); | |
/** create attribute */ | |
attribute_id= H5Acreate2( | |
dataset_id, | |
"rdfs:comment", | |
attribute_type_id, | |
attribute_dataspace_id,//Attribute dataspace identifier | |
H5P_DEFAULT,//Attribute creation property list identifier | |
H5P_DEFAULT//Attribute access property list identifier | |
); | |
ASSERT_POSITIVE(attribute_id); | |
err = H5Awrite(attribute_id, attribute_type_id, attribute_str); | |
ASSERT_POSITIVE(err); | |
/* close attribute */ | |
err = H5Aclose(attribute_id); | |
ASSERT_POSITIVE(err); | |
/* close attribute type */ | |
err = H5Tclose(attribute_type_id); | |
ASSERT_POSITIVE(err); | |
/* close attribute space */ | |
err = H5Sclose(attribute_dataspace_id); | |
ASSERT_POSITIVE(err); | |
/* close dataset props */ | |
err=H5Pclose(props); | |
ASSERT_POSITIVE(err); | |
/* close memspace */ | |
err=H5Sclose(memspace_id); | |
ASSERT_POSITIVE(err); | |
/* close dataset */ | |
err=H5Dclose(dataset_id); | |
ASSERT_POSITIVE(err); | |
/* close dataspace */ | |
err=H5Sclose(dataspace_id); | |
ASSERT_POSITIVE(err); | |
/* close data type rs## */ | |
err=H5Tclose(rs_type_id); | |
ASSERT_POSITIVE(err); | |
/* close data type chrom */ | |
err=H5Tclose(chrom_type_id); | |
ASSERT_POSITIVE(err); | |
/* close group */ | |
err=H5Gclose(group_id); | |
ASSERT_POSITIVE(err); | |
/* close file */ | |
err=H5Fclose(file_id); | |
ASSERT_POSITIVE(err); | |
return EXIT_SUCCESS; | |
} |
The Makefile
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
HDF5HOME=${HOME}/package/hdf5 | |
CFLAGS=-I ${HDF5HOME}/include | |
LDFLAGS=-L ${HDF5HOME}/lib | |
all:test | |
test:sample.tsv a.out | |
./a.out < sample.tsv | |
${HDF5HOME}/bin/h5dump storage.h5 | |
a.out:dbsnp2hdf5.c | |
gcc -Wall -o $@ $(CFLAGS) ${LDFLAGS} dbsnp2hdf5.c -lhdf5 | |
sample.tsv: | |
curl -s "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp132.txt.gz" |\ | |
gunzip -c |\ | |
cut -d ' ' -f 2-5 |\ | |
head -n 10000 > $@ |
Compile and run
gcc -Wall -o a.out -I /path/to/hdf5/include -L /path/to/hdf5/lib dbsnp2hdf5.c -lhdf5
./a.out < sample.tsv
Dump the data
At the en, the program creates a structured binary file containing our SNPs. The HDF5 utility h5dump can be used to display the data as text:
$ h5dump storage.h5
HDF5 "storage.h5" {
GROUP "/" {
GROUP "variations" {
DATASET "dbSNP" {
DATATYPE H5T_COMPOUND {
H5T_STRING {
STRSIZE 13;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
} "rs";
H5T_STRING {
STRSIZE 7;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
} "chrom";
H5T_STD_I32LE "chromStart";
H5T_STD_I32LE "chromEnd";
}
DATASPACE SIMPLE { ( 10000 ) / ( H5S_UNLIMITED ) }
DATA {
(0): {
"rs112750067",
"chr1",
10326,
10327
},
(1): {
"rs56289060",
"chr1",
10433,
10433
},
(2): {
"rs112155239",
"chr1",
10439,
10440
},
(3): {
"rs112766696",
"chr1",
10439,
10440
},
(...)
450425,
450426
}
}
ATTRIBUTE "rdfs:comment" {
DATATYPE H5T_STRING {
STRSIZE 15;
STRPAD H5T_STR_NULLTERM;
CSET H5T_CSET_ASCII;
CTYPE H5T_C_S1;
}
DATASPACE SCALAR
DATA {
(0): "My list of SNPs"
}
}
}
}
}
}
That's it !
Pierre
1 comment:
Well and then there's GWASpi, which loads genotypes in a HDF compatible netCDF file.
It seems to be the same type of storage as you can open up netCDf files with HDFView.
For sequences (reads and alignments) there's also the BioHDF libraries which do all the storage and I/O work for you.
Post a Comment