02 February 2015

Listing the 'Subject' Sequences in a BLAST database using the NCBI C++ toolbox. My notebook.

In my previous post (http://plindenbaum.blogspot.com/2015/01/filtering-fasta-sequences-using-ncbi-c.html) I've built an application filtering FASTA sequences using the
NCBI C++ toolbox (http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/). Here, I'm gonna write a tool listing the 'subject' sequences in a BLAST database.

This new application ListBlastDatabaseContent takes only one argument '-db', the name of the blast database :

void ListBlastDatabaseContent::Init()
    {
    (...)
    /* argument for name */
    arg_desc->AddDefaultKey(
            "db",/* name */
            "database",/* synopsis */
            "Blast database",/* comment */
            CArgDescriptions::eString, /* argument type */
            "" /* default value*/
            );
    (...)
    }   

in ListBlastDatabaseContent::Run() we get the name of the database

(...)
string database_name(  this->GetArgs()["db"].AsString() );
(...)

and we open a new CSearchDatabase object ("Blast Search Subject")

CSearchDatabase* searchDatabase = new CSearchDatabase(
    database_name, /* db name */
    CSearchDatabase::eBlastDbIsNucleotide /* db type */
    );

We get a handle to the seqdb, "it defines access to the database component by calling methods on objects which represent the various database files, such as the index, header, sequence, and alias files".

CRef<CSeqDB> seqdb= searchDatabase->GetSeqDb();

We get an iterator to the seqdb database http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCSeqDBIter.html:

CSeqDBIter  iter = seqdb->Begin();

While this iterator is valid, we retrieve and print the DNA sequence and the header associated to the iterator.

while(iter)
    {
    string output;
    TSeqRange range; /* entire sequence */
    seqdb->GetSequenceAsString(iter.GetOID(),output,range); 
    CRef< CBlast_def_line_set > def_line= seqdb->GetHdr(iter.GetOID());
    CBlast_def_line_set:: Tdata::const_iterator r = def_line->Get().begin();
    while(r!= def_line->Get().end())
        {
        cout << ">" << (*r)->GetTitle()  << endl;
        cout << output << endl;
        ++r;
        }
    ++iter;
    }

All in one:

#include <memory>
#include <limits>
#include "corelib/ncbiapp.hpp"
#include <algo/blast/api/blast_types.hpp>
#include <algo/blast/api/sseqloc.hpp>
#include <algo/blast/api/blast_aux.hpp>
#include <algo/blast/api/blast_options_handle.hpp>
#include <algo/blast/api/blast_results.hpp>
#include <algo/blast/api/local_blast.hpp>
#include <ctype.h>



USING_NCBI_SCOPE;
using namespace blast;

/**
 *  Name
 *      ListBlastDatabaseContent
 *  Motivation:
 *      Print content of blast database
 *  Author:
 *      Pierre Lindenbaum PhD 2015
 *
 */
class ListBlastDatabaseContent : public CNcbiApplication /* extends a basic NCBI application defined in c++/include/corelib/ncbiapp.hpp */
    {
    public:

        /* constructor, just set the version to '1.0.0'*/
        ListBlastDatabaseContent()
            {
            CRef<CVersion> version(new CVersion());
            version->SetVersionInfo(1, 0, 0);
            this->SetFullVersion(version);
            }
            
            
        /* called to initialize rge program.
        *  The default behavior of this in 'CNcbiApplication' is "do nothing".
        *  Here, we set the command line arguments.
        */
        virtual void Init()
            {
            CArgDescriptions* arg_desc = new CArgDescriptions ; /* defined in /c++/include/corelib/ncbiargs.hpp */
            arg_desc->SetUsageContext(
                GetArguments().GetProgramBasename(),
                __FILE__
                );
            
         
            /* argument for name */
           arg_desc->AddDefaultKey(
                    "db",/* name */
                    "database",/* synopsis */
                    "Blast database",/* comment */
                    CArgDescriptions::eString, /* argument type */
                    "" /* default value*/
                    );
            
            /* push this command args */
            SetupArgDescriptions( arg_desc );
            }   
        
        /* class destructor */
        virtual ~ListBlastDatabaseContent()
            {
            }
        
        /* workhorse of the program */
        virtual int Run()
            {
            string database_name(  this->GetArgs()["db"].AsString() );
            /* Blast Search Subject.  http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCSearchDatabase.html#details */
            CSearchDatabase* searchDatabase = 0;
            
        
            /* initialize search database */
            searchDatabase = new CSearchDatabase(
                database_name, /* db name */
                CSearchDatabase::eBlastDbIsNucleotide /* db type */
                );
            
            /* This class provides the top-level interface class for BLAST database users. It 
            defines access to the database component by calling methods on objects which represent
            the various database files, such as the index, header, sequence, and alias files. 
            http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCSeqDB.html */
            CRef<CSeqDB> seqdb= searchDatabase->GetSeqDb();                 
            
            /* Small class to iterate over a seqdb database. http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCSeqDBIter.html#details */
            CSeqDBIter  iter = seqdb->Begin();
            
            while(iter)
                {
                /* we put the sequence here */
                 string output;
                 /* retrieve the entire sequence */
                 TSeqRange range;
                /* This method gets the sequence data for an OID, converts it to a human-readable encoding (either Iupacaa for protein, or Iupacna for nucleotide), and returns it in a string. This is equivalent to calling the three-argument versions of this method with those encodings. http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCSeqDB.html */
                seqdb->GetSequenceAsString(iter.GetOID(),output,range); 
                /* Represents ASN.1 type Blast-def-line-set defined in file blastdb.asn http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/doxyhtml/classCBlast__def__line__set.html */
                CRef< CBlast_def_line_set > def_line= seqdb->GetHdr(iter.GetOID());
                CBlast_def_line_set:: Tdata::const_iterator r = def_line->Get().begin();
                while(r!= def_line->Get().end())
                    {
                    cout << ">" << (*r)->GetTitle()  << endl;
                    cout << output << endl;
                    ++r;
                    }

                ++iter;
                }
            if( searchDatabase != 0) delete searchDatabase;
            return 0;
            }
    };

int main(int argc,char** argv)
    {
    return ListBlastDatabaseContent().AppMain(
        argc,
        argv,
        0, /* envp Environment pointer */
        eDS_ToStderr,/* log message. In /c++/include/corelib/ncbidiag.hpp  */
        0, /* Specify registry to load, as per LoadConfig() */
        "loadblastdb" /* Specify application name */
        );
    }

Testing

get some fasta sequences, remove the headers and create a blast database.

$ curl  "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=281381068,281381067,281381066,281381065&rettype=fasta" |\
        awk '/^>/ {printf(">%d\n",NR);next;} {print;}' > sequences.fa

$ makeblastdb -in sequences.fa -dbtype nucl

run the program:

$ loadblastdb -db sequences.fa
>1
GATCGCGGGTATCTTCGATGTAGGGCGAGTCGCGTCACCCTGACCAACCCCTCGAGGTGGTCCTCGTGGAAGGAGGAAGCGCAGTCGACGAAGCGGAAGATGTGGCTGAGCCATCTCGTCGTGTCGGCCGCGAGACGACCCGCGAGTTTCTGCGCGTGTGACTTTGGTAGCGCTACTGCGTAAGCCGAGA
>6
CAGAAACTCGCGGGTCGTCTCGCGGCCGACACGACGAGATGGCTCAGCCACATCTTCCGCTTCGTCGACTGCGCTTCCTCCTTCCACGAGGACCACCTCGAGGGGTTGGTCAGGGTGACGCGACTC
>10
ACCGAAACGGCAAAATTGGCCTTAGTCACGCACACTATCTCGGCTTACGCAGTAGCGCTACCAAAGTCACACGCGCAGAAACTCGCGGGTCGTCTCGCGGCCGACACGACGAGATGGCTCAGCCACATCTTCCGCTTCGTCGACTGCGCTTCCTCCTTCCACGAGGACCACCTCGAGGGGTTGGTCAGGGTGACGCGACTCGCCCTACATCGAAGATACCCGCGATC
>16
GATCGAACGATAAACTTCTGCTATCGGTTCCTCAGCAATCAGTATAACGAGCGGCACATGTATTGCGGCATATTGCAAAAAACATACCTATAAAGAAGGCTAAACTGATTGATTCAAATTAACTTGAGAATACACGATGTTGATGTTCAGTCGAAACAAATCGCTCGCGTATTGTGATC

That's it,
Pierre

2 comments:

Peter Cock said...

Is this intended as an educational example or a proof of principle? You can already do this particular task with the NCBI's blastdbcmd tool.

Pierre Lindenbaum said...

Hi Peter, I'm just learning the API ...