Filtering Fasta Sequences using the #NCBI C++ API. My notebook.
In my previous post (http://plindenbaum.blogspot.com/2015/01/compiling-c-hello-world-program-using.html) I've built a simple "Hello World" application using the
NCBI C++ toolbox (http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/). Here, I'm gonna to extend the code in order to create a program filtering FASTA sequences on their sizes.
This new application FastaFilterSize
needs three new arguments:
'-m'
min size of the fasta sequence'-M'
max size of the fasta sequence'-v'
inverse selection
the method FastaFilterSize::Init() is used to register those optional arguments
/* optional argument for min size */
arg_desc->AddOptionalKey(
"m",
"min_size",
"min size inclusive",
CArgDescriptions::eInteger
);
/* optional argument for max size */
arg_desc->AddOptionalKey(
"M",
"max_size",
"max size inclusive",
CArgDescriptions::eInteger
);
/* add simple flag to inverse selection */
arg_desc->AddFlag("v",
"inverse selection",
true
);
the FastaFilterSize::Run() method is the workhorse of the program. Here, we get the output stream that will be used by a CFastaOstream to print the filtered Fasta sequences:
(...)
/* get output stream */
CNcbiOstream& out = this->GetArgs()["o"].AsOutputFile();
/* create a FASTA writer */
CFastaOstream* writer = new CFastaOstream(out);
(...)
We get the input stream that will be used by a CFastaReader to read the FASTA sequences to be filtered.
/* get input stream */
CNcbiIstream& input_stream = this->GetArgs()["i"].AsInputFile();
CFastaReader reader(input_stream ,
CFastaReader::fNoParseID /* http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/fasta_8hpp_source.html */
);
We get the optional values for min-length
and max-length
:
if (this->GetArgs()["m"])
{
min_inclusive = this->GetArgs()["m"].AsInteger();
}
if (this->GetArgs()["M"])
{
max_inclusive = this->GetArgs()["M"].AsInteger();
}
We get the 'inverse' flag.
bool invert=this->GetArgs()["v"];
While a Fasta sequence can be read, we check its size and we print it if needed:
while( !reader.AtEOF() )
{
/* read next sequence */
CRef<CSeq_entry> seq(reader.ReadOneSeq());
/* should we print the sequence ?*/
if((seq->GetSeq().GetLength()>= min_inclusive &&
seq->GetSeq().GetLength()<= max_inclusive
)!= invert)
{
/* yes, print */
writer->Write(*seq);
}
}
All in one:
#include <memory>
#include <limits>
#include <objtools/readers/fasta.hpp>
#include "corelib/ncbiapp.hpp"
#include <objects/seq/Bioseq.hpp>
#include <objects/seq/Bioseq.hpp>
#include <objects/seq/Delta_ext.hpp>
#include <objects/seq/Delta_seq.hpp>
#include <objects/seq/NCBIeaa.hpp>
#include <objects/seq/IUPACaa.hpp>
#include <objects/seq/IUPACna.hpp>
#include <objects/seq/Seg_ext.hpp>
#include <objects/seq/Seq_annot.hpp>
#include <objects/seq/Seq_descr.hpp>
#include <objects/seq/Seq_ext.hpp>
#include <objects/seq/Seq_hist.hpp>
#include <objects/seq/Seq_inst.hpp>
#include <objects/seq/Seq_literal.hpp>
#include <objects/seq/Seqdesc.hpp>
#include <objects/seq/seqport_util.hpp>
#include <objects/seqalign/Dense_seg.hpp>
#include <objects/seqalign/Seq_align.hpp>
#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/Seq_interval.hpp>
#include <objects/seqloc/Seq_loc.hpp>
#include <objects/seqloc/Seq_loc_mix.hpp>
#include <objects/seqloc/Seq_point.hpp>
#include <objects/seqset/Bioseq_set.hpp>
#include <objects/seqset/Seq_entry.hpp>
#include <objtools/readers/message_listener.hpp>
#include <objtools/readers/line_error.hpp>
#include <objtools/seqmasks_io/mask_reader.hpp>
#include <objtools/seqmasks_io/mask_writer.hpp>
#include <objtools/seqmasks_io/mask_fasta_reader.hpp>
#include <objmgr/util/sequence.hpp>
#include <ctype.h>
USING_NCBI_SCOPE;
USING_SCOPE(objects);
/**
* FastaFilterSize
*
* Motivation:
* filter sequences on size
* Author:
* Pierre Lindenbaum PhD 2015
*
*/
class FastaFilterSize : public CNcbiApplication /* extends a basic NCBI application defined in c++/include/corelib/ncbiapp.hpp */
{
public:
/* constructor, just set the version to '1.0.0'*/
FastaFilterSize()
{
CRef<CVersion> version(new CVersion());
version->SetVersionInfo(1, 0, 0);
this->SetFullVersion(version);
}
/* called to initialize rge program.
* The default behavior of this in 'CNcbiApplication' is "do nothing".
* Here, we set the command line arguments.
*/
virtual void Init()
{
CArgDescriptions* arg_desc = new CArgDescriptions ; /* defined in /c++/include/corelib/ncbiargs.hpp */
arg_desc->SetUsageContext(
GetArguments().GetProgramBasename(),
"A program filtering fasta sequences on size"
);
/* argument for input file */
arg_desc->AddDefaultKey(
"i",/* name */
"input_file_name",/* synopsis */
"input file name",/* comment */
CArgDescriptions::eInputFile, /* argument type */
"-" /* default value*/
);
/* argument for output file */
arg_desc->AddDefaultKey(
"o",/* name */
"output_file_name",/* synopsis */
"output file name",/* comment */
CArgDescriptions::eOutputFile, /* argument type */
"-" /* default value*/
);
/* optional argument for min size */
arg_desc->AddOptionalKey(
"m",
"min_size",
"min size inclusive",
CArgDescriptions::eInteger
);
/* optional argument for max size */
arg_desc->AddOptionalKey(
"M",
"max_size",
"max size inclusive",
CArgDescriptions::eInteger
);
/* add simple flag to inverse selection */
arg_desc->AddFlag("v",
"inverse selection",
true
);
/* push this command args */
SetupArgDescriptions( arg_desc );
}
/* class destructor */
virtual ~FastaFilterSize()
{
}
/* workhorse of the program */
virtual int Run()
{
/* get output stream */
CNcbiOstream& out = this->GetArgs()["o"].AsOutputFile();
/* create a FASTA writer */
CFastaOstream* writer = new CFastaOstream(out);
/* shall we inverse the selection ? */
bool invert=this->GetArgs()["v"];
/* get input stream */
CNcbiIstream& input_stream = this->GetArgs()["i"].AsInputFile();
/* init min and max inclusive */
TSeqPos min_inclusive=0;
TSeqPos max_inclusive= std::numeric_limits<TSeqPos>::max();
/* if min and max set by user */
if (this->GetArgs()["m"])
{
min_inclusive = this->GetArgs()["m"].AsInteger();
}
if (this->GetArgs()["M"])
{
max_inclusive = this->GetArgs()["M"].AsInteger();
}
/* create a FASTA parser */
CFastaReader reader(input_stream ,
CFastaReader::fNoParseID /* http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/fasta_8hpp_source.html */
);
/* while we can read something */
while( !reader.AtEOF() )
{
/* read next sequence */
CRef<CSeq_entry> seq(reader.ReadOneSeq());
/* should we print the sequence ?*/
if((seq->GetSeq().GetLength()>= min_inclusive &&
seq->GetSeq().GetLength()<= max_inclusive
)!= invert)
{
/* yes, print */
writer->Write(*seq);
}
}
/* cleanup */
delete writer;
out.flush();
return 0;
}
};
int main(int argc,char** argv)
{
return FastaFilterSize().AppMain(
argc,
argv,
0, /* envp Environment pointer */
eDS_ToStderr,/* log message. In /c++/include/corelib/ncbidiag.hpp */
0, /* Specify registry to load, as per LoadConfig() */
"filterfasta" /* Specify application name */
);
}
Compiling
g++ -std=gnu++11 -finline-functions -fstrict-aliasing -fomit-frame-pointer
-I/commun/data/users/lindenb/src/blast2.2.30/ncbi-blast-2.2.30+-src/c++/include
-I/commun/data/users/lindenb/src/blast2.2.30/ncbi-blast-2.2.30+-src/c++/ReleaseM
T/inc -Wno-deprecated -Wno-deprecated-declarations -o filtersize filtersize.cpp
-L/commun/data/users/lindenb/src/blast2.2.30/ncbi-blast-2.2.30+-src/c++/Release
MT/lib -lblast_app_util -lblastinput-static -lncbi_xloader_blastdb-static -lncbi
_xloader_blastdb_rmt-static -lxblastformat-static -lalign_format-static -ltaxon1
-static -lblastdb_format-static -lgene_info-static -lxalnmgr-static -lblastxml-s
tatic -lblastxml2-static -lxcgi-static -lxhtml-static -lxblast-static -lxalgobla
stdbindex-static -lcomposition_adjustment-static -lxalgodustmask-static -lxalgow
inmask-static -lseqmasks_io-static -lseqdb-static -lblast_services-static -lxobj
util-static -lxobjread-static -lvariation-static -lcreaders-static -lsubmit-stat
ic -lxnetblastcli-static -lxnetblast-static -lblastdb-static -lscoremat-static -
ltables-static -lxalnmgr-static -lncbi_xloader_genbank-static -lncbi_xreader_id1
-static -lncbi_xreader_id2-static -lncbi_xreader_cache-static -lxconnext-static
-lxconnect-static -ldbapi_driver-static -lncbi_xreader-static -lxconnext-static
-lxconnect-static -lid1-static -lid2-static -lseqsplit-static -lxcompress-static
-lxobjmgr-static -lgenome_collection-static -lseqedit-static -lseqset-static -l
seq-static -lseqcode-static -lsequtil-static -lpub-static -lmedline-static -lbib
lio-static -lgeneral-static -lxser-static -lxutil-static -lxncbi-static -lgomp -
lz -lbz2 -ldl -lnsl -lrt -lm -lpthread -L/commun/data/users/lindenb/src/blast2.2
.30/ncbi-blast-2.2.30+-src/c++/ReleaseMT/lib -lblast_app_util -lblastinput-stati
c -lncbi_xloader_blastdb-static -lncbi_xloader_blastdb_rmt-static -lxblastformat
-static -lalign_format-static -ltaxon1-static -lblastdb_format-static -lgene_inf
o-static -lxalnmgr-static -lblastxml-static -lblastxml2-static -lxcgi-static -lx
html-static -lxblast-static -lxalgoblastdbindex-static -lcomposition_adjustment-
static -lxalgodustmask-static -lxalgowinmask-static -lseqmasks_io-static -lseqdb
-static -lblast_services-static -lxobjutil-static -lxobjread-static -lvariation-
static -lcreaders-static -lsubmit-static -lxnetblastcli-static -lxnetblast-stati
c -lblastdb-static -lscoremat-static -ltables-static -lxalnmgr-static -lncbi_xlo
ader_genbank-static -lncbi_xreader_id1-static -lncbi_xreader_id2-static -lncbi_x
reader_cache-static -lxconnext-static -lxconnect-static -ldbapi_driver-static -l
ncbi_xreader-static -lxconnext-static -lxconnect-static -lid1-static -lid2-stati
c -lseqsplit-static -lxcompress-static -lxobjmgr-static -lgenome_collection-stat
ic -lseqedit-static -lseqset-static -lseq-static -lseqcode-static -lsequtil-stat
ic -lpub-static -lmedline-static -lbiblio-static -lgeneral-static -lxser-static
-lxutil-static -lxncbi-static -lgomp -lz -lbz2 -ldl -lnsl -lrt -lm -lpthread
Testing
get help
$ ./filtersize -help
USAGE
filtersize [-h] [-help] [-xmlhelp] [-i input_file_name]
[-o output_file_name] [-m min_size] [-M max_size] [-v]
[-logfile File_Name] [-conffile File_Name] [-version] [-version-full]
[-dryrun]
DESCRIPTION
A program filtering fasta sequences on size
OPTIONAL ARGUMENTS
(...)
-i <File_In>
input file name
Default = `-'
-o <File_Out>
output file name
Default = `-'
-m <Integer>
min size inclusive
-M <Integer>
max size inclusive
-v
inverse selection
(...)
test with some fasta sequences:
echo -e ">seq1\nAATCGTACGCTAG\n>seq2\nAATCGTACGATGCAAAAAAAAAAAATCTAG\n>seq3\nAATC\n>seq3\nAATC" |\
./filtersize -m 10 -M 20
>lcl|1 seq1
AATCGTACGCTAG
echo -e ">seq1\nAATCGTACGCTAG\n>seq2\nAATCGTACGATGCAAAAAAAAAAAATCTAG\n>seq3\nAATC\n>seq3\nAATC" |\
./filtersize -m 10 -M 20 -v
>lcl|2 seq2
AATCGTACGATGCAAAAAAAAAAAATCTAG
>lcl|3 seq3
AATC
>lcl|4 seq3
AATC
That's it,
Pierre
No comments:
Post a Comment