15 October 2012

Modifying the GATK so it supports an XML-based format for VCF.

I've modified the sources of the GATK so that it can write the variations in an XML-based format, in addition to the standard VCF format.
Here are the sources I've modified or added:
new file: org.broadinstitute.sting.utils.variantcontext.writer.AbstractVCFWriter:

package org.broadinstitute.sting.utils.variantcontext.writer;

import java.io.File;
import java.io.OutputStream;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;

import net.sf.samtools.SAMSequenceDictionary;

public abstract class AbstractVCFWriter
 extends IndexingVariantContextWriter
 {
    // the VCF header we're storing
 protected VCFHeader mHeader = null;

 protected IntGenotypeFieldAccessors intGenotypeFieldAccessors = new IntGenotypeFieldAccessors();

    // should we write genotypes or just sites?
    final protected boolean doNotWriteGenotypes;
    final protected boolean allowMissingFieldsInHeader;

 
    protected AbstractVCFWriter(
      final File location,
      final OutputStream output,
      final SAMSequenceDictionary refDict,
            final boolean enableOnTheFlyIndexing,
            boolean doNotWriteGenotypes,
            final boolean allowMissingFieldsInHeader
            )
     {
     super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
        this.doNotWriteGenotypes = doNotWriteGenotypes;
        this.allowMissingFieldsInHeader = allowMissingFieldsInHeader;

     }
    
    protected VCFHeader getVCFHeader()
     {
     return this.mHeader;
     }
    
   protected static Map<Allele, String> buildAlleleMap(final VariantContext vc) {
        final Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size()+1);
        alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup

        final List<Allele> alleles = vc.getAlleles();
        for ( int i = 0; i < alleles.size(); i++ ) {
            alleleMap.put(alleles.get(i), String.valueOf(i));
        }

        return alleleMap;
    }

   
   private static final String QUAL_FORMAT_STRING = "%.2f";
   private static final String QUAL_FORMAT_EXTENSION_TO_TRIM = ".00";

   protected String formatQualValue(double qual) {
       String s = String.format(QUAL_FORMAT_STRING, qual);
       if ( s.endsWith(QUAL_FORMAT_EXTENSION_TO_TRIM) )
           s = s.substring(0, s.length() - QUAL_FORMAT_EXTENSION_TO_TRIM.length());
       return s;
   }

   public static final void missingSampleError(final VariantContext vc, final VCFHeader header) {
       final List<String> badSampleNames = new ArrayList<String>();
       for ( final String x : header.getGenotypeSamples() )
           if ( ! vc.hasGenotype(x) ) badSampleNames.add(x);
       throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects.  Missing samples are " + Utils.join(",", badSampleNames));
   }
   
   protected boolean isMissingValue(String s) {
       // we need to deal with the case that it's a list of missing values
       return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length());
   }

   
   /**
    * Takes a double value and pretty prints it to a String for display
    *
    * Large doubles => gets %.2f style formatting
    * Doubles < 1/10 but > 1/100 => get %.3f style formatting
    * Doubles < 1/100 => get %.3e style formatting
    * @param d
    * @return
    */
   public static final String formatVCFDouble(final double d) {
       String format;
       if ( d < 1 ) {
           if ( d < 0.01 ) {
               if ( Math.abs(d) >= 1e-20 )
                   format = "%.3e";
               else {
                   // return a zero format
                   return "0.00";
               }
           } else {
               format = "%.3f";
           }
       } else {
           format = "%.2f";
       }

       return String.format(format, d);
   }

   public static String formatVCFField(Object val) {
       String result;
       if ( val == null )
           result = VCFConstants.MISSING_VALUE_v4;
       else if ( val instanceof Double )
           result = formatVCFDouble((Double) val);
       else if ( val instanceof Boolean )
           result = (Boolean)val ? "" : null; // empty string for true, null for false
       else if ( val instanceof List ) {
           result = formatVCFField(((List)val).toArray());
       } else if ( val.getClass().isArray() ) {
           int length = Array.getLength(val);
           if ( length == 0 )
               return formatVCFField(null);
           StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0)));
           for ( int i = 1; i < length; i++) {
               sb.append(",");
               sb.append(formatVCFField(Array.get(val, i)));
           }
           result = sb.toString();
       } else
           result = val.toString();

       return result;
   }

   /**
    * Determine which genotype fields are in use in the genotypes in VC
    * @param vc
    * @return an ordered list of genotype fields in use in VC.  If vc has genotypes this will always include GT first
    */
   public static List<String> calcVCFGenotypeKeys(final VariantContext vc, final VCFHeader header) {
       Set<String> keys = new HashSet<String>();

       boolean sawGoodGT = false;
       boolean sawGoodQual = false;
       boolean sawGenotypeFilter = false;
       boolean sawDP = false;
       boolean sawAD = false;
       boolean sawPL = false;
       for ( final Genotype g : vc.getGenotypes() ) {
           keys.addAll(g.getExtendedAttributes().keySet());
           if ( g.isAvailable() ) sawGoodGT = true;
           if ( g.hasGQ() ) sawGoodQual = true;
           if ( g.hasDP() ) sawDP = true;
           if ( g.hasAD() ) sawAD = true;
           if ( g.hasPL() ) sawPL = true;
           if (g.isFiltered()) sawGenotypeFilter = true;
       }

       if ( sawGoodQual ) keys.add(VCFConstants.GENOTYPE_QUALITY_KEY);
       if ( sawDP ) keys.add(VCFConstants.DEPTH_KEY);
       if ( sawAD ) keys.add(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
       if ( sawPL ) keys.add(VCFConstants.GENOTYPE_PL_KEY);
       if ( sawGenotypeFilter ) keys.add(VCFConstants.GENOTYPE_FILTER_KEY);

       List<String> sortedList = ParsingUtils.sortList(new ArrayList<String>(keys));

       // make sure the GT is first
       if ( sawGoodGT ) {
           List<String> newList = new ArrayList<String>(sortedList.size()+1);
           newList.add(VCFConstants.GENOTYPE_KEY);
           newList.addAll(sortedList);
           sortedList = newList;
       }

       if ( sortedList.isEmpty() && header.hasGenotypingData() ) {
           // this needs to be done in case all samples are no-calls
           return Collections.singletonList(VCFConstants.GENOTYPE_KEY);
       } else {
           return sortedList;
       }
   }


   private static int countOccurrences(char c, String s) {
          int count = 0;
          for (int i = 0; i < s.length(); i++) {
              count += s.charAt(i) == c ? 1 : 0;
          }
          return count;
   }


 }

modified file: org.broadinstitute.sting.utils.variantcontext.writer.VCFWriter:
/*
 * Copyright (c) 2010, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.utils.variantcontext.writer;

import net.sf.samtools.SAMSequenceDictionary;
import org.broad.tribble.TribbleException;
import org.broad.tribble.util.ParsingUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.*;

import java.io.*;
import java.util.*;

/**
 * this class writes VCF files
 */
class VCFWriter extends AbstractVCFWriter {
    private final static String VERSION_LINE = VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_1.getFormatString() + "=" + VCFHeaderVersion.VCF4_1.getVersionString();

    // the print stream we're writing to
    final protected BufferedWriter mWriter;




    public VCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
                     final boolean enableOnTheFlyIndexing,
                     boolean doNotWriteGenotypes,
                     final boolean allowMissingFieldsInHeader )
     {
        super(location, output, refDict, enableOnTheFlyIndexing,doNotWriteGenotypes,allowMissingFieldsInHeader);
        mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
     }

    // --------------------------------------------------------------------------------
    //
    // VCFWriter interface functions
    //
    // --------------------------------------------------------------------------------

    @Override
    public void writeHeader(VCFHeader header) {
        // note we need to update the mHeader object after this call because the header
        // may have genotypes trimmed out of it, if doNotWriteGenotypes is true
        super.mHeader = writeHeader(header, mWriter, doNotWriteGenotypes, getVersionLine(), getStreamName());
     }

    public static final String getVersionLine() {
        return VERSION_LINE;
    }

    public static VCFHeader writeHeader(VCFHeader header,
                                        final Writer writer,
                                        final boolean doNotWriteGenotypes,
                                        final String versionLine,
                                        final String streamNameForError) {
        header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
        
        try {
            // the file format field needs to be written first
            writer.write(versionLine + "\n");

            for ( VCFHeaderLine line : header.getMetaDataInSortedOrder() ) {
                if ( VCFHeaderVersion.isFormatString(line.getKey()) )
                    continue;

                writer.write(VCFHeader.METADATA_INDICATOR);
                writer.write(line.toString());
                writer.write("\n");
            }

            // write out the column line
            writer.write(VCFHeader.HEADER_INDICATOR);
            boolean isFirst = true;
            for ( VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) {
                if ( isFirst )
                    isFirst = false; // don't write out a field separator
                else
                    writer.write(VCFConstants.FIELD_SEPARATOR);
                writer.write(field.toString());
            }

            if ( header.hasGenotypingData() ) {
                writer.write(VCFConstants.FIELD_SEPARATOR);
                writer.write("FORMAT");
                for ( String sample : header.getGenotypeSamples() ) {
                    writer.write(VCFConstants.FIELD_SEPARATOR);
                    writer.write(sample);
                }
            }

            writer.write("\n");
            writer.flush();  // necessary so that writing to an output stream will work
        }
        catch (IOException e) {
            throw new ReviewedStingException("IOException writing the VCF header to " + streamNameForError, e);
        }

        return header;
    }

    /**
     * attempt to close the VCF file
     */
    @Override
    public void close() {
        // try to close the vcf stream
        try {
            mWriter.flush();
            mWriter.close();
        } catch (IOException e) {
            throw new ReviewedStingException("Unable to close " + getStreamName(), e);
        }

        super.close();
    }

    /**
     * add a record to the file
     *
     * @param vc      the Variant Context object
     */
    @Override
    public void add(VariantContext vc) {
        if ( mHeader == null )
            throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());

        if ( doNotWriteGenotypes )
            vc = new VariantContextBuilder(vc).noGenotypes().make();

        try {
            super.add(vc);

            Map<Allele, String> alleleMap = buildAlleleMap(vc);

            // CHROM
            mWriter.write(vc.getChr());
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // POS
            mWriter.write(String.valueOf(vc.getStart()));
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // ID
            String ID = vc.getID();
            mWriter.write(ID);
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // REF
            String refString = vc.getReference().getDisplayString();
            mWriter.write(refString);
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // ALT
            if ( vc.isVariant() ) {
                Allele altAllele = vc.getAlternateAllele(0);
                String alt = altAllele.getDisplayString();
                mWriter.write(alt);

                for (int i = 1; i < vc.getAlternateAlleles().size(); i++) {
                    altAllele = vc.getAlternateAllele(i);
                    alt = altAllele.getDisplayString();
                    mWriter.write(",");
                    mWriter.write(alt);
                }
            } else {
                mWriter.write(VCFConstants.EMPTY_ALTERNATE_ALLELE_FIELD);
            }
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // QUAL
            if ( !vc.hasLog10PError() )
                mWriter.write(VCFConstants.MISSING_VALUE_v4);
            else
                mWriter.write(formatQualValue(vc.getPhredScaledQual()));
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // FILTER
            String filters = getFilterString(vc);
            mWriter.write(filters);
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            // INFO
            Map<String, String> infoFields = new TreeMap<String, String>();
            for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
                String key = field.getKey();

                if ( ! mHeader.hasInfoLine(key) )
                    fieldIsMissingFromHeaderError(vc, key, "INFO");

                String outputValue = formatVCFField(field.getValue());
                if ( outputValue != null )
                    infoFields.put(key, outputValue);
            }
            writeInfoString(infoFields);

            // FORMAT
            final GenotypesContext gc = vc.getGenotypes();
            if ( gc.isLazyWithData() && ((LazyGenotypesContext)gc).getUnparsedGenotypeData() instanceof String ) {
                mWriter.write(VCFConstants.FIELD_SEPARATOR);
                mWriter.write(((LazyGenotypesContext)gc).getUnparsedGenotypeData().toString());
            } else {
                List<String> genotypeAttributeKeys = calcVCFGenotypeKeys(vc, mHeader);
                if ( ! genotypeAttributeKeys.isEmpty() ) {
                    for ( final String format : genotypeAttributeKeys )
                        if ( ! mHeader.hasFormatLine(format) )
                            fieldIsMissingFromHeaderError(vc, format, "FORMAT");

                    final String genotypeFormatString = ParsingUtils.join(VCFConstants.GENOTYPE_FIELD_SEPARATOR, genotypeAttributeKeys);

                    mWriter.write(VCFConstants.FIELD_SEPARATOR);
                    mWriter.write(genotypeFormatString);

                    addGenotypeData(vc, alleleMap, genotypeAttributeKeys);
                }
            }
            
            mWriter.write("\n");
            mWriter.flush();  // necessary so that writing to an output stream will work
        } catch (IOException e) {
            throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e);
        }
    }


    // --------------------------------------------------------------------------------
    //
    // implementation functions
    //
    // --------------------------------------------------------------------------------

    private final String getFilterString(final VariantContext vc) {
        if ( vc.isFiltered() ) {
            for ( final String filter : vc.getFilters() )
                if ( ! mHeader.hasFilterLine(filter) )
                    fieldIsMissingFromHeaderError(vc, filter, "FILTER");

            return ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters()));
        }
        else if ( vc.filtersWereApplied() )
            return VCFConstants.PASSES_FILTERS_v4;
        else
            return VCFConstants.UNFILTERED;
    }


    /**
     * create the info string; assumes that no values are null
     *
     * @param infoFields a map of info fields
     * @throws IOException for writer
     */
    private void writeInfoString(Map<String, String> infoFields) throws IOException {
        if ( infoFields.isEmpty() ) {
            mWriter.write(VCFConstants.EMPTY_INFO_FIELD);
            return;
        }

        boolean isFirst = true;
        for ( Map.Entry<String, String> entry : infoFields.entrySet() ) {
            if ( isFirst )
                isFirst = false;
            else
                mWriter.write(VCFConstants.INFO_FIELD_SEPARATOR);

            String key = entry.getKey();
            mWriter.write(key);

            if ( !entry.getValue().equals("") ) {
                VCFInfoHeaderLine metaData = mHeader.getInfoHeaderLine(key);
                if ( metaData == null || metaData.getCountType() != VCFHeaderLineCount.INTEGER || metaData.getCount() != 0 ) {
                    mWriter.write("=");
                    mWriter.write(entry.getValue());
                }
            }
        }
    }

    /**
     * add the genotype data
     *
     * @param vc                     the variant context
     * @param genotypeFormatKeys  Genotype formatting string
     * @param alleleMap              alleles for this context
     * @throws IOException for writer
     */
    private void addGenotypeData(VariantContext vc, Map<Allele, String> alleleMap, List<String> genotypeFormatKeys)
    throws IOException {
        for ( String sample : mHeader.getGenotypeSamples() ) {
            mWriter.write(VCFConstants.FIELD_SEPARATOR);

            Genotype g = vc.getGenotype(sample);
            if ( g == null ) {
                missingSampleError(vc, mHeader);
            }

            final List<String> attrs = new ArrayList<String>(genotypeFormatKeys.size());
            for ( String field : genotypeFormatKeys ) {
                if ( field.equals(VCFConstants.GENOTYPE_KEY) ) {
                    if ( !g.isAvailable() ) {
                        throw new ReviewedStingException("GTs cannot be missing for some samples if they are available for others in the record");
                    }

                    writeAllele(g.getAllele(0), alleleMap);
                    for (int i = 1; i < g.getPloidy(); i++) {
                        mWriter.write(g.isPhased() ? VCFConstants.PHASED : VCFConstants.UNPHASED);
                        writeAllele(g.getAllele(i), alleleMap);
                    }

                    continue;
                } else {
                    String outputValue;
                    if ( field.equals(VCFConstants.GENOTYPE_FILTER_KEY ) ) {
                        outputValue = g.isFiltered() ? g.getFilters() : VCFConstants.PASSES_FILTERS_v4;
                    } else {
                        final IntGenotypeFieldAccessors.Accessor accessor = intGenotypeFieldAccessors.getAccessor(field);
                        if ( accessor != null ) {
                            final int[] intValues = accessor.getValues(g);
                            if ( intValues == null )
                                outputValue = VCFConstants.MISSING_VALUE_v4;
                            else if ( intValues.length == 1 ) // fast path
                                outputValue = Integer.toString(intValues[0]);
                            else {
                                StringBuilder sb = new StringBuilder();
                                sb.append(intValues[0]);
                                for ( int i = 1; i < intValues.length; i++) {
                                    sb.append(",");
                                    sb.append(intValues[i]);
                                }
                                outputValue = sb.toString();
                            }
                        } else {
                            Object val = g.hasExtendedAttribute(field) ? g.getExtendedAttribute(field) : VCFConstants.MISSING_VALUE_v4;

                            VCFFormatHeaderLine metaData = mHeader.getFormatHeaderLine(field);
                            if ( metaData != null ) {
                                int numInFormatField = metaData.getCount(vc);
                                if ( numInFormatField > 1 && val.equals(VCFConstants.MISSING_VALUE_v4) ) {
                                    // If we have a missing field but multiple values are expected, we need to construct a new string with all fields.
                                    // For example, if Number=2, the string has to be ".,."
                                    StringBuilder sb = new StringBuilder(VCFConstants.MISSING_VALUE_v4);
                                    for ( int i = 1; i < numInFormatField; i++ ) {
                                        sb.append(",");
                                        sb.append(VCFConstants.MISSING_VALUE_v4);
                                    }
                                    val = sb.toString();
                                }
                            }

                            // assume that if key is absent, then the given string encoding suffices
                            outputValue = formatVCFField(val);
                        }
                    }

                    if ( outputValue != null )
                        attrs.add(outputValue);
                }
            }

            // strip off trailing missing values
            for (int i = attrs.size()-1; i >= 0; i--) {
                if ( isMissingValue(attrs.get(i)) )
                    attrs.remove(i);
                else
                    break;
            }

            for (int i = 0; i < attrs.size(); i++) {
                if ( i > 0 || genotypeFormatKeys.contains(VCFConstants.GENOTYPE_KEY) )
                    mWriter.write(VCFConstants.GENOTYPE_FIELD_SEPARATOR);
                mWriter.write(attrs.get(i));
            }
        }
    }



    private void writeAllele(Allele allele, Map<Allele, String> alleleMap) throws IOException {
        String encoding = alleleMap.get(allele);
        if ( encoding == null )
            throw new TribbleException.InternalCodecException("Allele " + allele + " is not an allele in the variant context");
        mWriter.write(encoding);
    }

    private final void fieldIsMissingFromHeaderError(final VariantContext vc, final String id, final String field) {
        if ( !allowMissingFieldsInHeader)
            throw new UserException.MalformedVCFHeader("Key " + id + " found in VariantContext field " + field
                    + " at " + vc.getChr() + ":" + vc.getStart()
                    + " but this key isn't defined in the VCFHeader.  The GATK now requires all VCFs to have"
                    + " complete VCF headers by default.  This error can be disabled with the engine argument"
                    + " -U LENIENT_VCF_PROCESSING");
    }
}

modified file: org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory:
/*
 * Copyright (c) 2012, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.utils.variantcontext.writer;

import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.exceptions.UserException;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.EnumSet;

import javax.xml.stream.XMLStreamException;

/**
 * Factory methods to create VariantContext writers
 *
 * @author depristo
 * @since 5/12
 */
public class VariantContextWriterFactory {

    public static final EnumSet<Options> DEFAULT_OPTIONS = EnumSet.of(Options.INDEX_ON_THE_FLY);
    public static final EnumSet<Options> NO_OPTIONS = EnumSet.noneOf(Options.class);

    private VariantContextWriterFactory() {}

    public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict) {
        return create(location, openOutputStream(location), refDict, DEFAULT_OPTIONS);
    }

    public static VariantContextWriter create(final File location, final SAMSequenceDictionary refDict, final EnumSet<Options> options) {
        return create(location, openOutputStream(location), refDict, options);
    }

    public static VariantContextWriter create(final File location,
                                              final OutputStream output,
                                              final SAMSequenceDictionary refDict) {
        return create(location, output, refDict, DEFAULT_OPTIONS);
    }

    public static VariantContextWriter create(final OutputStream output,
                                              final SAMSequenceDictionary refDict,
                                              final EnumSet<Options> options) {
        return create(null, output, refDict, options);
    }

    public static VariantContextWriter create(final File location,
                                              final OutputStream output,
                                              final SAMSequenceDictionary refDict,
                                              final EnumSet<Options> options) {
        final boolean enableBCF = isBCFOutput(location, options);

        if ( enableBCF )
            return new BCF2Writer(location, output, refDict,
                    options.contains(Options.INDEX_ON_THE_FLY),
                    options.contains(Options.DO_NOT_WRITE_GENOTYPES));
        else if(location!=null && location.getName().endsWith(".xml"))
         {
         try {
            return new XMLVariantContextWriter(location, output, refDict,
                    options.contains(Options.INDEX_ON_THE_FLY),
                    options.contains(Options.DO_NOT_WRITE_GENOTYPES),
                    options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER)
                    );
         } catch(XMLStreamException err)
          {
          throw new UserException.CouldNotCreateOutputFile(location, "Unable to create XML writer", err);
          }
         }
        else
         {
            return new VCFWriter(location, output, refDict,
                    options.contains(Options.INDEX_ON_THE_FLY),
                    options.contains(Options.DO_NOT_WRITE_GENOTYPES),
                    options.contains(Options.ALLOW_MISSING_FIELDS_IN_HEADER));
         }
     } 

    /**
     * Should we output a BCF file based solely on the name of the file at location?
     *
     * @param location
     * @return
     */
    public static boolean isBCFOutput(final File location) {
        return isBCFOutput(location, EnumSet.noneOf(Options.class));
    }

    public static boolean isBCFOutput(final File location, final EnumSet<Options> options) {
        return options.contains(Options.FORCE_BCF) || (location != null && location.getName().contains(".bcf"));
    }

    public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance) {
        return sortOnTheFly(innerWriter, maxCachingStartDistance, false);
    }

    public static VariantContextWriter sortOnTheFly(final VariantContextWriter innerWriter, int maxCachingStartDistance, boolean takeOwnershipOfInner) {
        return new SortingVariantContextWriter(innerWriter, maxCachingStartDistance, takeOwnershipOfInner);
    }

    /**
     * Returns an output stream writing to location, or throws a UserException if this fails
     * @param location
     * @return
     */
    protected static OutputStream openOutputStream(final File location) {
        try {
            return new FileOutputStream(location);
        } catch (FileNotFoundException e) {
            throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
        }
    }
}

new file: org.broadinstitute.sting.utils.variantcontext.writer.XMLVariantContextWriter:
package org.broadinstitute.sting.utils.variantcontext.writer;

import java.io.File;
import java.io.OutputStream;
import java.util.Map;

import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;

import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderVersion;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;

import net.sf.samtools.SAMSequenceDictionary;

public class XMLVariantContextWriter
 extends AbstractVCFWriter
 {
 public final String NS="http://xml.1000genomes.org/";
    // the print stream we're writing to
    final protected XMLStreamWriter mWriter;

 public XMLVariantContextWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict,
            final boolean enableOnTheFlyIndexing,
            boolean doNotWriteGenotypes,
            final boolean allowMissingFieldsInHeader )
   throws XMLStreamException
  {
  super(location, output, refDict, enableOnTheFlyIndexing,doNotWriteGenotypes,allowMissingFieldsInHeader);
  XMLOutputFactory factory=XMLOutputFactory.newInstance();
  this.mWriter=factory.createXMLStreamWriter(super.getOutputStream());
  }

 protected void writeMetaData(String key,String value)
  throws XMLStreamException
  {
  if(value!=null)
   {
      this.mWriter.writeStartElement("metadata");
      this.mWriter.writeAttribute("key",key);
      this.mWriter.writeCharacters(value);
      this.mWriter.writeEndElement();
   }
  else
   {
      this.mWriter.writeEmptyElement("metadata");
      this.mWriter.writeAttribute("key",key);
   }
  }
 
 @Override
 public void writeHeader(VCFHeader header)
  {
   
     //
     
        header = doNotWriteGenotypes ? new VCFHeader(header.getMetaDataInSortedOrder()) : header;
        
        try {
         this.mWriter.writeStartElement("vcf");
         this.mWriter.writeAttribute("xmlns", NS);
         this.mWriter.writeStartElement("head");
         
         writeMetaData(
           VCFHeaderVersion.VCF4_1.getFormatString(),
           VCFHeaderVersion.VCF4_1.getVersionString()
           );
         //INFO
         this.mWriter.writeStartElement("info-list");
         for ( VCFInfoHeaderLine line : header.getInfoHeaderLines() )
            {
             this.mWriter.writeStartElement("info");
             this.mWriter.writeAttribute("ID",line.getID());
             this.mWriter.writeAttribute("type",line.getType().name());
             if(line.isFixedCount()) this.mWriter.writeAttribute("count",String.valueOf(line.getCount()));
             this.mWriter.writeCharacters(line.getDescription());
             this.mWriter.writeEndElement();
            }
         this.mWriter.writeEndElement();
         
         //FORMAT
         this.mWriter.writeStartElement("format-list");
         for ( VCFFormatHeaderLine line : header.getFormatHeaderLines() )
            {
             this.mWriter.writeStartElement("format");
             this.mWriter.writeAttribute("ID",line.getID());
             this.mWriter.writeAttribute("type",line.getType().name());
             if(line.isFixedCount()) this.mWriter.writeAttribute("count",String.valueOf(line.getCount()));
             this.mWriter.writeCharacters(line.getDescription());
             this.mWriter.writeEndElement();
            }
         this.mWriter.writeEndElement();
         
         //FILTER
         this.mWriter.writeStartElement("filters-list");
         for ( VCFFilterHeaderLine line : header.getFilterLines() )
            {
             this.mWriter.writeStartElement("filter");
             this.mWriter.writeAttribute("ID",line.getID());
             this.mWriter.writeCharacters(line.getValue());
             this.mWriter.writeEndElement();
            }
         this.mWriter.writeEndElement();

         //CONTIGS
         this.mWriter.writeStartElement("contigs-list");
         for ( VCFContigHeaderLine line : header.getContigLines() )
            {
             this.mWriter.writeStartElement("contig");
             this.mWriter.writeAttribute("ID",line.getID());
             this.mWriter.writeAttribute("index",String.valueOf(line.getContigIndex()));
             this.mWriter.writeEndElement();
            }
         this.mWriter.writeEndElement();
         
         //SAMPLES
         this.mWriter.writeStartElement("samples-list");
         for (int i=0;i< header.getSampleNamesInOrder().size();++i )
            {
             this.mWriter.writeStartElement("sample");
             this.mWriter.writeAttribute("id",String.valueOf(i+1));
             this.mWriter.writeCharacters(header.getSampleNamesInOrder().get(i));
             this.mWriter.writeEndElement();
            }
         this.mWriter.writeEndElement();

         this.mWriter.writeEndElement();//head
         this.mWriter.writeStartElement("body");
         this.mWriter.writeStartElement("variations");
        }
        catch (XMLStreamException e)
      {
         throw new ReviewedStingException("IOException writing the VCF/XML header to " + super.getStreamName(), e);
      }

     }
 
 @Override
    public void add(VariantContext vc)
     { 
        try
         {
         super.add(vc);

            Map<Allele, String> alleleMap = buildAlleleMap(vc);
             
         this.mWriter.writeStartElement("variation");
         
         this.mWriter.writeAttribute("chrom",vc.getChr());
         this.mWriter.writeAttribute("pos",String.valueOf(vc.getStart()));

         
            String ID = vc.getID();
            this.mWriter.writeStartElement("id");
            this.mWriter.writeCharacters(ID);
            this.mWriter.writeEndElement();//id

         this.mWriter.writeStartElement("ref");
         this.mWriter.writeCharacters( vc.getReference().getDisplayString());
         this.mWriter.writeEndElement();


         if ( vc.isVariant() )
          {
                for (int i = 0; i < vc.getAlternateAlleles().size(); i++)
                 {
                    Allele altAllele = vc.getAlternateAllele(i);
                 this.mWriter.writeStartElement("alt");
                 this.mWriter.writeCharacters(altAllele.getDisplayString());
                 this.mWriter.writeEndElement();
                 }
          }
         
         
         this.mWriter.writeEndElement();//variation
         }
     catch(XMLStreamException err)
      {
      throw new ReviewedStingException("Cannot close XMLStream",err);
      }
  }

 
 @Override
    public void close()
     {
        try
         {
         // finish the XML document before the underlying stream is closed
         this.mWriter.writeEndElement();//variations
         this.mWriter.writeEndElement();//body
         this.mWriter.writeEndElement();//vcf
         this.mWriter.flush();
         this.mWriter.close();
         }
        catch(XMLStreamException err)
         {
         throw new ReviewedStingException("Cannot close XMLStream",err);
         }
        try
         {
         getOutputStream().close();
         }
        catch(Throwable err)
         {
         throw new ReviewedStingException("Cannot close output stream",err);
         }
        super.close();
     }
 }

Compiling and testing

With my version, if the output filename ends with ".xml", the XML writer is used instead of the standard VCF writer.
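
For example, a tool that already writes its variations through VariantContextWriterFactory needs no change at all: the output file name alone decides which writer is built. Here is a minimal, purely illustrative sketch of that call path (the XmlOutputDemo class and its openWriter method are mine, not part of the patch; the VCFHeader and sequence dictionary are assumed to be available from the engine):

import java.io.File;
import java.util.EnumSet;

import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;

public class XmlOutputDemo
 {
 /** opens a writer: a ".xml" suffix selects the XMLVariantContextWriter, anything else the plain VCFWriter */
 public static VariantContextWriter openWriter(final File out, final SAMSequenceDictionary dict, final VCFHeader header)
  {
  final VariantContextWriter w = VariantContextWriterFactory.create(
     out, dict, EnumSet.noneOf(Options.class)); // no on-the-fly indexing in this sketch
  w.writeHeader(header);
  return w;
  }
 }

With "-o ex1f.vcf.xml" (as in the run below), the factory falls into the new endsWith(".xml") branch and builds an XMLVariantContextWriter.
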
$ ant
(...)
java -jar dist/GenomeAnalysisTK.jar  \
   -T UnifiedGenotyper \
   -o ex1f.vcf.xml \
   -R ex1.fa \
   -I sorted.bam

INFO  17:12:28,358 HelpFormatter - ---------------------------------------------------------------------------------------------------------- 
INFO  17:12:28,361 HelpFormatter - The Genome Analysis Toolkit (GATK) vdbffd2fa3e7a043a6951d8ac58dd619e68a6caa8, Compiled 2012/10/15 16:53:32 
INFO  17:12:28,361 HelpFormatter - Copyright (c) 2010 The Broad Institute 
INFO  17:12:28,361 HelpFormatter - For support and documentation go to http://www.broadinstitute.org/gatk 
INFO  17:12:28,362 HelpFormatter - Program Args: -T UnifiedGenotyper -o ex1f.vcf.xml -R ex1.fa -I sorted.bam 
INFO  17:12:28,363 HelpFormatter - Date/Time: 2012/10/15 17:12:28 
INFO  17:12:28,364 HelpFormatter - ---------------------------------------------------------------------------------------------------------- 
INFO  17:12:28,364 HelpFormatter - ---------------------------------------------------------------------------------------------------------- 
INFO  17:12:28,392 GenomeAnalysisEngine - Strictness is SILENT 
INFO  17:12:28,430 SAMDataSource$SAMReaders - Initializing SAMRecords in serial 
INFO  17:12:28,444 SAMDataSource$SAMReaders - Done initializing BAM readers: total time 0.01 
INFO  17:12:28,835 TraversalEngine - [INITIALIZATION COMPLETE; TRAVERSAL STARTING] 
INFO  17:12:28,835 TraversalEngine -        Location processed.sites  runtime per.1M.sites completed total.runtime remaining 
INFO  17:12:30,721 TraversalEngine - Total runtime 2.00 secs, 0.03 min, 0.00 hours 
INFO  17:12:30,723 TraversalEngine - 108 reads were filtered out during traversal out of 9921 total (1.09%) 
INFO  17:12:30,727 TraversalEngine -   -> 108 reads (1.09% of total) failing UnmappedReadFilter 

And here is the XML file produced. I've just played with the XML format; handling the INFO fields and the genotypes of each variation was still on my todo list (a rough sketch of what the genotype part could look like follows the output below).

$ xmllint --format ex1f.vcf.xml

<?xml version="1.0"?>
<vcf xmlns="http://xml.1000genomes.org/">
  <head>
    <metadata key="fileformat">VCFv4.1</metadata>
    <info-list>
      <info ID="FS" type="Float" count="1">Phred-scaled p-value using Fisher's exact test to detect strand bias</info>
      <info ID="AN" type="Integer" count="1">Total number of alleles in called genotypes</info>
      <info ID="BaseQRankSum" type="Float" count="1">Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities</info>
      <info ID="MQ" type="Float" count="1">RMS Mapping Quality</info>
      <info ID="AF" type="Float">Allele Frequency, for each ALT allele, in the same order as listed</info>
       (....)
    </info-list>
    <format-list>
      <format ID="DP" type="Integer" count="1">Approximate read depth (reads with MQ=255 or with bad mates are filtered)</format>
      <format ID="GT" type="String" count="1">Genotype</format>
      <format ID="PL" type="Integer">Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification</format>
      <format ID="GQ" type="Integer" count="1">Genotype Quality</format>
      <format ID="AD" type="Integer">Allelic depths for the ref and alt alleles in the order listed</format>
    </format-list>
    <filters-list>
      <filter ID="LowQual"/>
    </filters-list>
    <contigs-list>
      <contig ID="seq1" index="0"/>
      <contig ID="seq2" index="1"/>
    </contigs-list>
    <samples-list>
      <sample id="1">ex1</sample>
      <sample id="2">ex1b</sample>
    </samples-list>
  </head>
  <body>
    <variations>
      <variation chrom="seq1" pos="285">
        <id>.</id>
        <ref>T</ref>
        <alt>A</alt>
      </variation>
      <variation chrom="seq1" pos="287">
        <id>.</id>
        <ref>C</ref>
        <alt>A</alt>
      </variation>
      (....)
    </variations>
  </body>
</vcf>

I've proposed my code to the GATK team, but they only want to support the VCF format. I'm so saaaad.




That's it ;-) ,


Pierre

14 October 2012

Calculating time from submission to publication / Degree of burden in submitting a paper

After "404 not found": a database of non-functional resources in the NAR database collection, I've uploaded my second dataset on figshare:
Calculating time from submission to publication / Degree of burden in submitting a paper
.

Calculating time from submission to publication / Degree of burden in submitting a paper. Pierre Lindenbaum, Ryan Delahanty. figshare. Retrieved 10:13, Oct 14, 2012 (GMT). http://dx.doi.org/10.6084/m9.figshare.96403

This dataset was inspired by this post on Biostar, initially asked by Ryan Delahanty: "I was wondering if it would be possible to calculate some kind of a metric for the speed-of-publication for each journal. I'm not sure submitted and accepted dates are available for all papers, but I noticed in the XML data there are fields like the following:"
<PubmedData>
        <History>
            <PubMedPubDate PubStatus="received">
                <Year>2011</Year>
                <Month>11</Month>
                <Day>29</Day>
                <Hour>6</Hour>
                <Minute>0</Minute>
            </PubMedPubDate>
            <PubMedPubDate PubStatus="accepted">
                <Year>2011</Year>
                <Month>12</Month>
                <Day>20</Day>
                <Hour>6</Hour>
                <Minute>0</Minute>
            </PubMedPubDate>
           (...)

In this dataset, the script 'pubmed.sh' downloads the list of journals from http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.pubmedhelptable45/ and the eigenfactors from http://www.eigenfactor.org.

For each journal, it scans PubMed (starting from year 2000) and computes the difference between the date[@PubStatus='received'] and the date[@PubStatus='accepted'].
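
The heart of it is just a day difference between those two PubMedPubDate entries of each record. A standalone Java sketch of that single step (the ReceivedToAccepted class and its getDate helper are mine, only illustrative; the dataset itself does this with pubmed.sh) could look like:

import java.net.URL;
import java.util.GregorianCalendar;
import java.util.concurrent.TimeUnit;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class ReceivedToAccepted
 {
 /** extracts the date for a given PubStatus from the PubmedData/History block */
 private static long getDate(final XPath xpath, final Document dom, final String status) throws Exception
  {
  final String base = "//PubmedData/History/PubMedPubDate[@PubStatus='" + status + "']/";
  final int year  = Integer.parseInt(xpath.evaluate(base + "Year",  dom));
  final int month = Integer.parseInt(xpath.evaluate(base + "Month", dom));
  final int day   = Integer.parseInt(xpath.evaluate(base + "Day",   dom));
  return new GregorianCalendar(year, month - 1, day).getTimeInMillis();
  }

 public static void main(final String[] args) throws Exception
  {
  // fetch one PubMed record as XML (PMID given on the command line)
  final String url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=" + args[0];
  final Document dom = DocumentBuilderFactory.newInstance()
     .newDocumentBuilder().parse(new URL(url).openStream());
  final XPath xpath = XPathFactory.newInstance().newXPath();
  // accepted minus received, printed as a number of days
  final long diffMsec = getDate(xpath, dom, "accepted") - getDate(xpath, dom, "received");
  System.out.println(TimeUnit.MILLISECONDS.toDays(diffMsec));
  }
 }

Running it on PMID 20591334, one of the erroneous records mentioned in the note below, should print a negative number of days.
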

title | issn | eigenfactor | days
"Acta biochimica Polonica" | 0001-527X | 0.003996 | 119.770935960591
"Acta biomaterialia" | 1742-7061 | 0.02152 | 129.682692307692
"Acta biotheoretica" | 0001-5342 | 0.000844 | 161.897058823529
"Acta cirurgica brasileira / Sociedade Brasileira para Desenvolvimento Pesquisa em Cirurgia" | 0102-8650 | 0.00128 | 122.038461538462
"Acta cytologica" | 0001-5547 | 0.002305 | 65.3006134969325
"Acta diabetologica" | 0940-5429 | 0.001851 | 299.6
"Acta haematologica" | 0001-5792 | 0.002825 | 118.654676258993
"Acta histochemica" | 0065-1281 | 0.002162 | 110.471204188482
"Acta histochemica et cytochemica" | 0044-5991 | 0.000677 | 81.6455696202532
"Acta neurochirurgica" | 0001-6268 | 0.009685 | 204.371830985916
"Acta neuropathologica" | 0001-6322 | 0.023471 | 69.7277882797732
"Acta theriologica" | 0001-7051 | 0.000901 | 147.0
"Acta tropica" | 0001-706X | 0.01011 | 196.577777777778
"Acta veterinaria Scandinavica" | 0044-605X | 0.001612 | 82.0
"Addictive behaviors" | 0306-4603 | 0.017915 | 163.049731182796
"Advances in space research" | 0273-1177 | 0.021217 | 205.0
Ambio | 0044-7447 | 0.007463 | 181.878048780488
"American journal of human genetics" | 0002-9297 | 0.120156 | 67.1898928024502
"American journal of hypertension" | 0895-7061 | 0.017359 | 104.074576271186
(....)

Here is the kind of figure I got:

As far as I remember, "Cell" is the point with the highest eigenfactor.


Note: PubMed contains some errors, e.g. received > accepted (http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=20591334&retmode=xml) or dates set in the future (http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=12921703&retmode=xml).


That's it,

Pierre