<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2016.00269</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>GenSeed-HMM: A Tool for Progressive Assembly Using Profile HMMs as Seeds and its Application in <italic>Alpavirinae</italic> Viral Discovery from Metagenomic Data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Alves</surname> <given-names>Jo&#x000E3;o M. P.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn004"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/286023/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>de Oliveira</surname> <given-names>Andr&#x000E9; L.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn004"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/289301/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sandberg</surname> <given-names>Tatiana O. M.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/306315/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Moreno-Gallego</surname> <given-names>Jaime L.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>de Toledo</surname> <given-names>Marcelo A. F.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/325053/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>de Moura</surname> <given-names>Elisabeth M. M.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/320987/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Oliveira</surname> <given-names>Liliane S.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/309319/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Durham</surname> <given-names>Alan M.</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/40273/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Mehnert</surname> <given-names>Dolores U.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Zanotto</surname> <given-names>Paolo M. de A.</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Reyes</surname> <given-names>Alejandro</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="author-notes" rid="fn001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/139775/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Gruber</surname> <given-names>Arthur</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/38749/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Parasitology, Institute of Biomedical Sciences, University of S&#x000E3;o Paulo</institution> <country>S&#x000E3;o Paulo, Brazil</country></aff>
<aff id="aff2"><sup>2</sup><institution>Graduate program in Computational Biology, Universidad de los Andes</institution> <country>Bogot&#x000E1;, Colombia</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Microbiology, Institute of Biomedical Sciences, University of S&#x000E3;o Paulo</institution> <country>S&#x000E3;o Paulo, Brazil</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Computer Science, Institute of Mathematics and Statistics, University of S&#x000E3;o Paulo</institution> <country>S&#x000E3;o Paulo, Brazil</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Biological Sciences, Universidad de los Andes</institution> <country>Bogot&#x000E1;, Colombia</country></aff>
<aff id="aff6"><sup>6</sup><institution>Center for Genome Sciences and Systems Biology, Department of Pathology and Immunology, Washington University in Saint Louis</institution> <country>MO, USA</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Akio Adachi, Tokushima University Graduate School, Japan</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Thierry Candresse, Institut National de la Recherche Agronomique, France; Makoto Kuroda, National Institute of Infectious Diseases, Japan</p></fn>
<fn fn-type="corresp" id="fn001"><p>&#x0002A;Correspondence: Alejandro Reyes <email>a.reyes&#x00040;uniandes.edu.co</email>;</p></fn>
<fn fn-type="corresp" id="fn002"><p>Arthur Gruber <email>argruber&#x00040;usp.br</email></p></fn>
<fn fn-type="other" id="fn003"><p>This article was submitted to Virology, a section of the journal Frontiers in Microbiology</p></fn>
<fn fn-type="other" id="fn004"><p>&#x02020;These authors have contributed equally to the work.</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>04</day>
<month>03</month>
<year>2016</year>
</pub-date>
<pub-date pub-type="collection">
<year>2016</year>
</pub-date>
<volume>7</volume>
<elocation-id>269</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>10</month>
<year>2015</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>02</month>
<year>2016</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2016 Alves, de Oliveira, Sandberg, Moreno-Gallego, de Toledo, de Moura, Oliveira, Durham, Mehnert, Zanotto, Reyes and Gruber.</copyright-statement>
<copyright-year>2016</copyright-year>
<copyright-holder>Alves, de Oliveira, Sandberg, Moreno-Gallego, de Toledo, de Moura, Oliveira, Durham, Mehnert, Zanotto, Reyes and Gruber</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) or licensor are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>This work reports the development of GenSeed-HMM, a program that implements seed-driven progressive assembly, an approach to reconstruct specific sequences from unassembled data, starting from short nucleotide or protein seed sequences or profile Hidden Markov Models (HMM). The program can use any one of a number of sequence assemblers. Assembly is performed in multiple steps and relatively few reads are used in each cycle, consequently the program demands low computational resources. As a proof-of-concept and to demonstrate the power of HMM-driven progressive assemblies, GenSeed-HMM was applied to metagenomic datasets in the search for diverse ssDNA bacteriophages from the recently described <italic>Alpavirinae</italic> subfamily. Profile HMMs were built using <italic>Alpavirinae</italic>-specific regions from multiple sequence alignments (MSA) using either the viral protein 1 (VP1; major capsid protein) or VP4 (genome replication initiation protein). These profile HMMs were used by GenSeed-HMM (running Newbler assembler) as seeds to reconstruct viral genomes from sequencing datasets of human fecal samples. All contigs obtained were annotated and taxonomically classified using similarity searches and phylogenetic analyses. The most specific profile HMM seed enabled the reconstruction of 45 partial or complete <italic>Alpavirinae</italic> genomic sequences. A comparison with conventional (global) assembly of the same original dataset, using Newbler in a standalone execution, revealed that GenSeed-HMM outperformed global genomic assembly in several metrics employed. This approach is capable of detecting organisms that have not been used in the construction of the profile HMM, which opens up the possibility of diagnosing novel viruses, without previous specific information, constituting a <italic>de novo</italic> diagnosis. 
Additional applications include, but are not limited to, the specific assembly of extrachromosomal elements such as plastid and mitochondrial genomes from metagenomic data. Profile HMM seeds can also be used to reconstruct specific protein coding genes for gene diversity studies, and to determine all possible gene variants present in a metagenomic sample. Such surveys could be useful to detect the emergence of drug-resistance variants in sensitive environments such as hospitals and animal production facilities, where antibiotics are regularly used. Finally, GenSeed-HMM can be used as an adjunct for gap closure on assembly finishing projects, by using multiple contig ends as anchored seeds.</p>
</abstract>
<kwd-group>
<kwd><italic>Alpavirinae</italic></kwd>
<kwd>sequence assembly</kwd>
<kwd>metagenomic analysis</kwd>
<kwd>viral discovery</kwd>
<kwd><italic>de novo</italic> diagnosis</kwd>
</kwd-group>
<contract-num rid="cn001">Productivity-in-Research fellowships, IC scholarship</contract-num>
<contract-num rid="cn002">DT scholarship</contract-num>
<contract-num rid="cn003">2013/14622-3</contract-num>
<contract-num rid="cn004">FAPA</contract-num>
<contract-sponsor id="cn001">Conselho Nacional de Desenvolvimento Cient&#x000ED;fico e Tecnol&#x000F3;gico<named-content content-type="fundref-id">10.13039/501100003593</named-content></contract-sponsor>
<contract-sponsor id="cn002">Coordena&#x000E7;&#x000E3;o de Aperfei&#x000E7;oamento de Pessoal de N&#x000ED;vel Superior<named-content content-type="fundref-id">10.13039/501100002322</named-content></contract-sponsor>
<contract-sponsor id="cn003">Funda&#x000E7;&#x000E3;o de Amparo &#x000E0; Pesquisa do Estado de S&#x000E3;o Paulo<named-content content-type="fundref-id">10.13039/501100001807</named-content></contract-sponsor>
<contract-sponsor id="cn004">Universidad de los Andes<named-content content-type="fundref-id">10.13039/501100006070</named-content></contract-sponsor>
<counts>
<fig-count count="6"/>
<table-count count="1"/>
<equation-count count="0"/>
<ref-count count="57"/>
<page-count count="15"/>
<word-count count="12326"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>Introduction</title>
<p>From the golden age of phage research establishing the basis for the development of molecular biology, virus research suffered a decline due to several technical difficulties, in particular the necessity of knowing the specific viral and host life cycles and conditions for <italic>in vitro</italic> growth (Rosenberg, <xref ref-type="bibr" rid="B41">2015</xref>). With the advent of next generation sequencing (NGS) and metagenomics, viral discovery and research entered a new successful age. A pioneering metagenome study, a virome of uncultured marine viral communities (Breitbart et al., <xref ref-type="bibr" rid="B5">2002</xref>), revealed a predominance of bacteriophages, and demonstrated the potential of metagenomics in the field of viral research. Since then, viral ecology has risen as a new field, and it is now possible to assess the viral composition of a microbial community and understand the fundamental role that these highly abundant biological entities play in any environment, with particular efforts shown in marine environments (Rohwer and Thurber, <xref ref-type="bibr" rid="B40">2009</xref>). However, since the very start of the metagenomic bloom, it has been clear that our knowledge of viral diversity is scarce and relies on viruses where the host is known and can be cultivated, severely restricting the known viral diversity to possibly less than 1% of what is actually out there (<italic>cf</italic>. Fancello et al., <xref ref-type="bibr" rid="B18">2012</xref>). Furthermore, the rate of shotgun data generation has outpaced the sequencing of reference viral genomes, and this ever-increasing gap limits our capacity to analyze newly generated datasets. Thus, the development of new computational tools is of utmost importance to increase our understanding of viral diversity (Fancello et al., <xref ref-type="bibr" rid="B18">2012</xref>). 
Some of the most important pandemic diseases arose by the transmission of viruses originally present in animals that were able to adapt to the human host (Wang, <xref ref-type="bibr" rid="B53">2011</xref>; Rosenberg, <xref ref-type="bibr" rid="B41">2015</xref>). Thus, a systematic surveillance for emerging viruses is crucial to enable the detection of novel and potentially devastating ones before they become pandemic (Lipkin and Firth, <xref ref-type="bibr" rid="B24">2013</xref>; Smits and Osterhaus, <xref ref-type="bibr" rid="B48">2013</xref>).</p>
<p>The human and animal microbiome field has benefited immensely from the advances in NGS and metagenomics (Tang and Chiu, <xref ref-type="bibr" rid="B51">2010</xref>; Bexfield and Kellam, <xref ref-type="bibr" rid="B3">2011</xref>). The number of studies characterizing the gut microbiome has increased exponentially in recent years, and such studies have linked changes in these complex communities to diseases ranging from obesity and malnutrition to even Alzheimer&#x00027;s and autism (Mayer et al., <xref ref-type="bibr" rid="B26">2014</xref>). An important component of this microbial community is the viral one, in particular phages that are an integral part of the community (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>, <xref ref-type="bibr" rid="B38">2012</xref>, <xref ref-type="bibr" rid="B36">2015</xref>; Dutilh et al., <xref ref-type="bibr" rid="B14">2014</xref>; Norman et al., <xref ref-type="bibr" rid="B30">2015</xref>). Since the early studies of the viral component of the gut microbiota, an important limitation has been the lack of reference viral genomes infecting the Firmicutes and Bacteroidetes, which constitute the most abundant bacterial phyla inhabiting the gut (Arumugam et al., <xref ref-type="bibr" rid="B1">2011</xref>). Bacteriophages are gaining growing relevance in gut microbiome studies where changes in viral and phage population have been linked to alterations in the microbial community and/or human health (Norman et al., <xref ref-type="bibr" rid="B30">2015</xref>; Reyes et al., <xref ref-type="bibr" rid="B36">2015</xref>).</p>
<p><italic>Alpavirinae</italic>, a recently characterized subfamily of the <italic>Microviridae</italic> family, is composed of ssDNA phages that exist either as temperate phages of Bacteroidetes genomes (Kim et al., <xref ref-type="bibr" rid="B20">2011</xref>; Krupovic and Forterre, <xref ref-type="bibr" rid="B22">2011</xref>) or infectious particles (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>; Zhong et al., <xref ref-type="bibr" rid="B57">2015</xref>). Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>) analyzed metagenomic data from different geographic locations and biological sources, and described a large set of complete, previously undescribed <italic>Microviridae</italic> genomes, including 33 <italic>Alpavirinae</italic> genomes. More recently, Quaiser et al. (<xref ref-type="bibr" rid="B34">2015</xref>) described 17 additional complete <italic>Microviridae</italic> genomes from a <italic>Sphagnum</italic>-dominated peatland. A recent study (Zhong et al., <xref ref-type="bibr" rid="B57">2015</xref>) reported the occurrence of <italic>Microviridae</italic> in peri-alpine lakes, mainly represented by gokushoviruses, but also including <italic>Alpavirinae</italic>, a finding that confirms that this latter group is also present in fresh waters, possibly in both lysogenic and lytic forms. Cantalupo et al. (<xref ref-type="bibr" rid="B8">2011</xref>) found diverse viral populations in raw sewage, with 80% of the metagenomic reads being related to bacteriophages and, from this subset, 37% were derived from <italic>Microviridae</italic>. Considering that relatively few genomes of the <italic>Alpavirinae</italic> subfamily have been described so far and their initial description as Bacteroidetes associated viruses, this taxonomic group constitutes an interesting case study for a new viral discovery strategy.</p>
<p>One of the most challenging tasks for metagenomic data analysis is the assembly phase (Wajid and Serpedin, <xref ref-type="bibr" rid="B52">2012</xref>; El-Metwally et al., <xref ref-type="bibr" rid="B17">2013</xref>). Several algorithms have been developed and can roughly be classified according to the graph construction method: greedy, OLC (overlap-layout-consensus), and de Bruijn graphs. Assemblers using the OLC method are most appropriate for datasets of relatively long reads, such as Sanger and 454 platforms, but the quadratic complexity of the overlap computation phase severely limits the size of the datasets that can be used. Assemblers using <italic>k</italic>-mers and de Bruijn graphs require much less computational power, but memory requirement is still high. Therefore, whatever the algorithm, sequence assemblers are highly demanding in terms of memory usage and/or processing power, especially for datasets in the magnitude of millions of reads. Additionally, most <italic>de novo</italic> assemblers have been developed for single-organism genome sequencing (Fancello et al., <xref ref-type="bibr" rid="B18">2012</xref>). In fact, <italic>de novo</italic> assembly of metagenomic data is particularly challenging for several reasons, among others: (1) the heterogeneous nature of the sample, with many different organisms; (2) uneven distribution of organism quantities, leading to biased sampling and coverage; (3) unlike single-organism genome sequencing, the number of final assembled sequences cannot be predicted; (4) sequences derived from closely related organisms may generate chimeric assemblies; (5) polymorphisms, in a way similar to sequencing errors, can disrupt assemblies by tangling the assembly graph (i.e., by creating specific topological structures such as tips and bubbles). 
With those challenges in mind, a few recent attempts have been made to either modify traditional assemblers or develop assemblers specifically designed for metagenomic data (Fancello et al., <xref ref-type="bibr" rid="B18">2012</xref>). However, such approaches still suffer from the same computational resource drawbacks mentioned above for traditional genome assemblers.</p>
<p>Many sequencing projects do not have as a goal the reconstruction of all possible sequences present in a sample, but rather aim at studying a well-defined gene, gene family, or a transcript. In this case, a target-specific assembly could represent a more sensible approach. To fulfill such a need, our group was the first one to develop a seed-driven progressive assembly algorithm, implemented in the GenSeed program (Sobreira and Gruber, <xref ref-type="bibr" rid="B49">2008</xref>), as a rational method to reconstruct specific targets from unassembled sequence datasets. GenSeed uses a short DNA or protein sequence as a query in similarity searches to select reads, which in turn are retrieved from the dataset and assembled together with the seed sequence, leading to an increment of its original length. Short sequences are then extracted from the assembled sequence ends and used as new seeds in an iterative process that generates progressively longer sequences at each assembly cycle. Because assembly is performed in multiple steps and relatively few reads are used in each cycle, the program demands low computational resources. Some recent approaches based on the same concept of seed-driven iterative assembly have been proposed for the assembly of viral sequences from metagenomic data (Smits et al., <xref ref-type="bibr" rid="B47">2015</xref>), but they are all restricted to the use of DNA sequence seeds. In this work, we report the development of GenSeed-HMM, a completely revised and highly incremented version of GenSeed. The proposed approach relies on two principles: (1) progressive assembly as an alternative for sequence reconstruction; and (2) the use of profile HMMs as starting seeds for target-driven reconstruction. As a proof of principle, we use GenSeed-HMM and profile HMMs built from <italic>Alpavirinae</italic> proteins to reconstruct novel viral sequences from human fecal samples. 
GenSeed-HMM allowed the reconstruction of many <italic>Alpavirinae</italic> genomes distinguishable from those described by Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>), outperforming conventional (global) genomic assembly in several metrics. GenSeed-HMM provides a fast and simple way to run progressive sequence assembly pipelines that are directly targeted at sequences of interest, potentially detecting members of a taxonomic group related but not equal to those used in the construction of the profile HMM. This feature opens up the possibility of diagnosing novel viruses, without previous specific information.</p>
</sec>
<sec sec-type="materials and methods" id="s2">
<title>Materials and methods</title>
<sec>
<title>Data sources</title>
<p>Two distinct metagenomic datasets were used in this study, derived from fecal microbiota and raw sewage samples. The metagenomic sequence data from fecal microbiota was obtained from monozygotic twins and their mothers, and sequenced on the 454 platform, as previously described (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>). Sequence datasets (accession codes <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="SRX028823">SRX028823</ext-link> to <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="SRX028827">SRX028827</ext-link>) were downloaded from the Sequence Read Archive (SRA) at <ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/sra">http://www.ncbi.nlm.nih.gov/sra</ext-link>. SRA format files were converted into FASTQ using the <italic>fastq-dump</italic> program (SRA toolkit) and all adaptors were trimmed with <italic>cutadapt</italic> (<ext-link ext-link-type="uri" xlink:href="https://cutadapt.readthedocs.org">https://cutadapt.readthedocs.org</ext-link>) using parameters <monospace>-q 30 --minimum-length 50 --overlap&#x0003D;5 -u 14</monospace>. Raw sewage (total volume of 15 L) collected at the municipality of Tabo&#x000E3;o da Serra (S&#x000E3;o Paulo, Brazil) was pressure-filtered through an AP-20 filter membrane (Merck Millipore) and electropositive filter membranes Zeta Plus 60 (AMF, Cuno Div.). Viruses were then eluted in a protein mix, concentrated by ultracentrifugation and treated with Vertrel XF (decafluoropentane, DuPont) to remove lipids and proteins (Mehnert and Stewien, <xref ref-type="bibr" rid="B27">1993</xref>; Queiroz et al., <xref ref-type="bibr" rid="B35">2001</xref>). Viral DNA was extracted using DNeasy Blood and Tissue kit (Qiagen&#x000AE;) and amplified with an illustra&#x02122; Single Cell GenomiPhi&#x02122; DNA Amplification Kit (GE Healthcare Life Sciences). 
The DNA was used to construct a library with the Nextera XT DNA Library Preparation Kit (Illumina, Inc.) and sequenced using the Illumina HiSeq 2500 System, generating 101-bp paired-end reads. To remove the Nextera transposase sequence, FASTQ files were trimmed with <italic>cutadapt</italic> using parameters <monospace>-q 30 -a CTGTCTCTTATACACATCT --minimum-length 50 --overlap&#x0003D;5 -u 2</monospace>.</p>
</sec>
<sec>
<title>GenSeed-HMM development and progressive assembly</title>
<p>GenSeed-HMM was developed in the Perl language and is publicly available for download under the terms of the GNU General Public License version 3 at <ext-link ext-link-type="uri" xlink:href="http://genseedhmm.sourceforge.net">http://genseedhmm.sourceforge.net</ext-link>. Installation instructions and documentation are also provided. All tests reported in this work were performed on a Dell PowerEdge T710 server with two Intel Xeon X5660 2.8 GHz processors and 64 GB of RAM. GenSeed-HMM can be used in any POSIX-compliant operating system such as UNIX and Linux distributions with an installed Perl interpreter (<ext-link ext-link-type="uri" xlink:href="http://www.perl.org">http://www.perl.org</ext-link>). The list of programs required by GenSeed-HMM varies according to the type of seed employed and the assembler that will be used, as well as whether mapping of recruited reads to resulting contigs is desired. For profile HMM seeds, the following packages/programs are required: <italic>transeq</italic> from the EMBOSS package (Rice et al., <xref ref-type="bibr" rid="B39">2000</xref>), BLAST&#x0002B; (Camacho et al., <xref ref-type="bibr" rid="B7">2009</xref>), and HMMER v3.0 (Eddy, <xref ref-type="bibr" rid="B15">2011</xref>). For the optional mapping of recruited reads against resulting contigs, Bowtie2 (Langmead and Salzberg, <xref ref-type="bibr" rid="B23">2012</xref>) is required. 
GenSeed-HMM requires at least one installed DNA assembler and is compatible with the following programs: SOAPdenovo (Luo et al., <xref ref-type="bibr" rid="B25">2012</xref>), ABySS (Simpson et al., <xref ref-type="bibr" rid="B45">2009</xref>), Velvet (Zerbino and Birney, <xref ref-type="bibr" rid="B55">2008</xref>), Newbler (GS <italic>De Novo</italic> Assembler, Roche 454 Life Sciences, available upon request at <ext-link ext-link-type="uri" xlink:href="http://my454.com/contact-us/software-request.asp">http://my454.com/contact-us/software-request.asp</ext-link>), and CAP3 (Huang and Madan, <xref ref-type="bibr" rid="B19">1999</xref>). If Newbler is to be used, programs <italic>sfffile</italic> and <italic>sffinfo</italic> (both distributed by Roche 454 Life Sciences) and <italic>splitter</italic> (from EMBOSS) are also required. Progressive assemblies were performed using GenSeed-HMM. Several parameter sets were tested to optimize assembly results. Parameters used in the final experiments reported here are: <monospace>-assembler newbler -ext_seed_size 30 -max_contig_length 10000 -threads 20 -clean no -mapping yes -blastn_parameters &#x00022;-evalue 0.0001 -num_threads 20 -dust no -perc_identity 85&#x00022; -no_qual</monospace>. Specific profile HMM seeds (Supplementary File <xref ref-type="supplementary-material" rid="SM1">1</xref>) were used throughout this work and specified on GenSeed-HMM with parameter <monospace>-seed</monospace>.</p>
</sec>
<sec>
<title>Profile HMM construction</title>
<p>For profile reconstruction, all available sequences corresponding to previously reported viral proteins (VP) of <italic>Alpavirinae</italic> (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>), named VP1, VP2, VP3, and VP4, were retrieved. Multiple sequence alignments (MSA) of each group of proteins were created using MUSCLE (Edgar, <xref ref-type="bibr" rid="B16">2004</xref>) with default parameters, and the alignments were manually inspected with Jalview (Waterhouse et al., <xref ref-type="bibr" rid="B54">2009</xref>) to identify conserved regions. The MSA was appended with the respective (VP1, VP2, VP3, or VP4) proteins from <italic>Gokushovirinae</italic> and <italic>Pichovirinae</italic> in order to determine whether identified conserved regions were subfamily specific. Specific regions on VP1 (Supplementary Figure <xref ref-type="supplementary-material" rid="SM3">1</xref>) and VP4 (not shown) were selected and profile HMMs were built using <italic>hmmbuild</italic> from the HMMER package (Eddy, <xref ref-type="bibr" rid="B15">2011</xref>). We adopted a nomenclature composed of the viral protein name (e.g., VP1) plus the region (e.g., R4) of the multiple sequence alignment chosen to build the respective profile HMM used as seed.</p>
</sec>
<sec>
<title>Assembly evaluation and cross-similarity analysis of contigs reconstructed with different profile HMM seeds</title>
<p>Contigs assembled with GenSeed-HMM were analyzed with in-house scripts to list and calculate contig lengths and generate contig size ranks. Contigs reconstructed by progressive assembly using GenSeed-HMM with different profile HMM seeds were sorted in descending order by length and submitted to an all-vs-all <italic>blastn</italic> similarity search. Clusters included contigs presenting at least 90% similarity at the nucleotide level, covering at least 90% of the length of the shortest contig. Contig clusters were used to evaluate consistency between assemblies based on different profile HMM seeds to identify the potential minimum contig set.</p>
</sec>
<sec>
<title>Taxonomic assignment of contigs</title>
<p>For taxonomic assignment of assembled contigs, a <italic>blastx</italic> similarity search was used to compare assembled contigs against all reference <italic>Microviridae</italic> proteins (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>) with a cutoff <italic>E</italic>-value of 1e-20. The top 10 hits were manually checked for consistency and each contig was assigned to the subfamily to which all significant hits belonged. In cases where hits with similar scores were obtained for more than one subfamily, the contig was assigned to all subfamilies matched. Taxonomic assignment to each cluster was done by comparing individual contig assignments within each cluster; for all clusters, we observed 100% agreement in taxonomic classification among the contigs constituting the corresponding cluster.</p>
</sec>
<sec>
<title>Contig distribution from different human samples</title>
<p>Sequence reads derived from each human donor fecal sample (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>) were mapped using Bowtie2 (Langmead and Salzberg, <xref ref-type="bibr" rid="B23">2012</xref>) to the contigs assembled by GenSeed-HMM using the VP1R4 seed. Mapping counts were normalized by contig length and sample sequencing effort (RPKM&#x02014;Reads Per Kilobase per Million mapped reads), and log transformed. The resulting matrix was used to generate a heatmap diagram.</p>
</sec>
<sec>
<title>Sequence analysis and annotation</title>
<p>All assembled contigs were submitted to an automatic annotation pipeline using the development version EGene 2, derived from the EGene platform (Durham et al., <xref ref-type="bibr" rid="B13">2005</xref>). The pipeline starts with a gene prediction step performed with Glimmer 3.02 (Delcher et al., <xref ref-type="bibr" rid="B11">2007</xref>) using a training set composed of <italic>Alpavirinae</italic> proteins (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>). All translated products were then submitted to <italic>blastp</italic> searches against the non-redundant (nr) database and a database composed of proteins derived from <italic>Microviridae</italic>. Hits were considered positive when presenting <italic>E</italic>-values below 1e-6. Protein domains and families were subsequently identified via InterPro (Mitchell et al., <xref ref-type="bibr" rid="B29">2015</xref>) searches. In the specific case of contig annotation from the VP1R4 assembly, annotation has been manually curated to find missing and/or truncated ORFs. Automatic annotations and all stored evidence for contigs are publicly available at <ext-link ext-link-type="uri" xlink:href="http://www.coccidia.icb.usp.br/alpavirinae">http://www.coccidia.icb.usp.br/alpavirinae</ext-link>.</p>
</sec>
<sec>
<title>Phylogenetic analysis</title>
<p>For each contig assembled using the VP1R4 profile HMM seed, the complete or partial VP1 sequence was identified, translated and used for phylogenetic analyses. Two sets of analyses were done: one using only complete VP1 proteins, while the other used only a conserved region present in all assembled contigs consisting of approximately 75 amino acids having the VP1R4 region at the C-terminus. Each set of proteins was complemented with reference VP1 proteins from published datasets (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>) and GenBank-deposited datasets (see Supplementary Table <xref ref-type="supplementary-material" rid="SM3">1</xref>) belonging to other <italic>Microviridae</italic> subfamilies: <italic>Gokushovirinae, Pichovirinae</italic>, and genus <italic>Microvirus</italic>. Protein alignments were performed using MUSCLE (Edgar, <xref ref-type="bibr" rid="B16">2004</xref>) and manually edited using Jalview (Waterhouse et al., <xref ref-type="bibr" rid="B54">2009</xref>). Phylogenetic analyses were performed using maximum-likelihood (ML) in RAxML 8.2.0 (Stamatakis, <xref ref-type="bibr" rid="B50">2006</xref>). The best-fitting amino acid substitution model for each set was obtained with ProtTest 3.4 using the AIC statistic for model selection (Darriba et al., <xref ref-type="bibr" rid="B9">2011</xref>). Finally, support for nodes in ML trees was assessed by bootstrap analysis with 100 pseudoreplicates and support values were added to the master ML tree.</p>
</sec>
<sec>
<title>Comparison of progressive vs. global assembly</title>
<p>To compare progressive assemblies with the global assembly counterparts, we ran Newbler as a standalone application, with default parameters, using the complete read datasets for single-end 454 (human fecal samples) and paired-end Illumina (sewage samples) data. For the latter, assembly was performed taking into account paired-end information in order to generate the best possible global assembly. All contig sequences obtained were translated into the six possible reading frames using <italic>transeq</italic> and then used as a dataset for <italic>hmmsearch</italic> (HMMER3 package) using the VP1R4 profile HMM as query. Contigs coding for HMM-positive protein sequences were identified and their nucleotide sequences used for size ranking and comparison to contigs assembled by GenSeed-HMM.</p>
</sec>
<sec>
<title>Coverage analysis</title>
<p>Read alignment (SAM) files produced by GenSeed-HMM were loaded onto Tablet (Milne et al., <xref ref-type="bibr" rid="B28">2013</xref>; <ext-link ext-link-type="uri" xlink:href="https://ics.hutton.ac.uk/tablet/">https://ics.hutton.ac.uk/tablet/</ext-link>) and used to generate base-by-base coverage files for each assembled contig. Coverage information of global assembly was obtained from alignment information files produced by Newbler, and average per-base-coverage for each contig was calculated. VP1R4-containing contigs, derived from the global and progressive assembly, were pooled together and submitted to a <italic>blastn</italic> all-vs-all similarity search. Contigs that were at least 97% identical at the nucleotide level over at least 90% of the length of the shortest contig were clustered.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec>
<title>GenSeed-HMM implementation and execution</title>
<p>GenSeed-HMM is a completely revised and extended version of the previously described GenSeed program (Sobreira and Gruber, <xref ref-type="bibr" rid="B49">2008</xref>). With the advent of next-generation sequencing (NGS) platforms, the ability to use up-to-date sequencing data and DNA assemblers became an essential feature for any sequence reconstruction program. Hence, several improvements over GenSeed&#x00027;s original implementation have been implemented: (1) in addition to CAP3, GenSeed-HMM can now use Newbler, Velvet, SOAPdenovo, or ABySS as third-party assemblers; (2) input formats now include FASTA, FASTA.QUAL, FASTQ, and SFF, including the possibility of using quality values for CAP3 and Newbler; (3) instead of BLAST, GenSeed-HMM now uses BLAST&#x0002B;, a new version of the BLAST suite that uses the NCBI C&#x0002B;&#x0002B; Toolkit and presents several performance and feature improvements; and (4), in addition to DNA and protein sequences, profile HMMs can now be employed as seeds by using HMMER3, a package that performs similarity searches using profile HMMs as queries, with a performance comparable to BLAST. GenSeed-HMM automatically detects seed type (DNA, protein or profile HMM; Figure <xref ref-type="fig" rid="F1">1A</xref>). The program accepts as input a sequencing dataset generated by any of a variety of platforms and, in our experience, GenSeed-HMM can effectively reconstruct sequences using datasets originating from Sanger, 454, or Illumina technologies, with reads as short as 35-bp (data not shown). The dataset format is automatically identified and, if necessary, converted to FASTA. The database for BLAST&#x0002B; is then generated by <italic>makeblastdb</italic> (from the BLAST&#x0002B; package). If a profile HMM is used as a seed, the sequencing dataset is submitted to a six-frame translation using <italic>transeq</italic> (from the EMBOSS package). 
GenSeed-HMM performs these steps only once and reuses previously generated files in subsequent runs (Figure <xref ref-type="fig" rid="F1">1B</xref>). The progressive assembly cycle (Figure <xref ref-type="fig" rid="F1">1C</xref>) starts either with a similarity search (<italic>blastn</italic> for DNA seeds, <italic>tblastn</italic> for protein seeds) or with a profile search (<italic>hmmsearch</italic> for profile HMM seeds) against the translated sequencing dataset. Whatever the type of similarity search, a list of hits is obtained and used to retrieve all positive reads (and, if applicable, their quality scores) using internal sequence indexer and retriever functions. The reads are then assembled and the contig ends are used as nucleotide seeds for the subsequent assembly round. These sequences, called extension seeds, can have a variable user-defined length compared to the original seed. All assembly steps use the recruited reads combined with the contig sequence from the previous round, to guarantee that previously obtained sequences will not be disrupted by the incorporation of new reads. The use of multiple seeds is implemented in GenSeed-HMM and if two or more growing contigs overlap at a given assembly cycle, the assembler merges them into a newly generated contig. At any cycle there are checkpoints that determine if new reads have been recruited since the last round and if the resulting contigs increased in length compared to the previous round. The progressive assembly process is interrupted if any one of four conditions is satisfied: (1) the contig has reached the optional user-defined maximum length; (2) the optional user-defined number of assembly iterations has been reached; (3) no new read has been recruited by the current extension seeds compared to the preceding round; or (4) no sequence length increment has been observed since the previous round. 
In this latter case, GenSeed-HMM executes an iterative trimming routine, which may help overcome extension halts caused by sequencing errors. Briefly, the program iteratively trims the ends of the contig, removing an amount of bases corresponding to 25% of the extension seed length at a time (for a maximum of three steps), and tries to repeat the assembly after each trimming phase. If any step succeeds at recruiting new reads and increasing the contig length, the progressive assembly process is resumed. Otherwise, the assembly process is finished and GenSeed-HMM proceeds to the final processing and file storing routines (Figure <xref ref-type="fig" rid="F1">1D</xref>). At the final checking procedure, all contigs assembled at the last round are checked for the presence of the original seed, with only seed-positive contigs being stored. Several processing files, including those generated in the intermediate assembly steps can be stored if specified by the user. Since the assembly is progressively generated, no true assembly files (e.g., those listing meaningful contig qualities, graph information, etc.) are produced. Thus, if required by the user, GenSeed-HMM invokes <italic>bowtie2</italic> to map all recruited reads onto the final contigs. A SAM file is then generated and stored, and can be inspected using a graphical viewer for sequence assemblies and alignments such as <italic>tablet</italic> (Milne et al., <xref ref-type="bibr" rid="B28">2013</xref>).</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p><bold>Workflow of the seed-driven progressive assembly process</bold>. GenSeed-HMM automatically identifies the type of starting seed <bold>(A)</bold>. The sequencing read database is indexed and, if needed, translated <bold>(B)</bold>. DNA, protein or profile HMM seeds are then used to select reads from the database using <italic>blastn, tblastn</italic>, or <italic>hmmsearch</italic>, respectively. The list of positive reads is introduced into the progressive assembly cycle <bold>(C)</bold>. The reads retrieved from the database are assembled and the contig ends are extracted and used as new seeds in an iterative process. The progressive assembly contains several checkpoints and is completed when a set of finishing criteria are fulfilled. In the final procedure <bold>(D)</bold>, all contigs are checked in regard to the presence of the starting seed and final files are stored.</p></caption>
<graphic xlink:href="fmicb-07-00269-g0001.tif"/>
</fig>
</sec>
<sec>
<title>Profile HMM design and use in progressive assembly</title>
<p>Since evolutionary processes may impose different selection pressures, proteins may evolve at different rates and even specific domains can present different degrees of conservation. We used GenSeed-HMM in order to identify potential, previously unidentified viruses belonging to the <italic>Alpavirinae</italic> subfamily, recently identified as part of human gut microbial communities. After analyzing the conservation of the <italic>Microviridae</italic> VP1, VP2, VP3, and VP4 proteins, we decided to initially use a dataset of VP1 and VP4 proteins from available <italic>Alpavirinae</italic> assembled genomes (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>) to identify conserved regions and then, by appending homologous proteins from <italic>Pichovirinae</italic> and <italic>Gokushovirinae</italic>, select regions with specificity to the subfamily <italic>Alpavirinae</italic>. VP1 is the major capsid protein, a highly conserved protein that has been used as a phylogenetic marker of the group, while VP4 is a genome replication initiation protein and is more diverse in sequence than VP1 (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>). A total of four distinct regions were selected for each of VP1 (Supplementary Figure <xref ref-type="supplementary-material" rid="SM3">1</xref>) and VP4 (not shown) proteins. All profile HMMs were independently tested as seeds in progressive assembly assays using GenSeed-HMM and a dataset derived from viral-like particle (VLP) purification from human fecal samples (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>). This dataset is composed of approximately 1.2 million reads generated on the 454 platform, presenting a post-trimming average size of 256 bases. Initially, all contig sets were evaluated by a simple quantitative criterion, considering solely the contig size rank. 
In the case of VP1 (Figure <xref ref-type="fig" rid="F2">2A</xref>), the VP1R1 profile HMM showed the best performance, with the largest number of long contigs, followed by VP1R4, VP1R5, and VP1R6, respectively. For the VP4 protein (Figure <xref ref-type="fig" rid="F2">2B</xref>), VP4R1 and VP4R3 showed the best results, with VP4R4 and VP4R2 clearly resulting in a much lower number of long-sized contigs. To check the robustness of the method to different NGS technologies, the same profile HMMs were also tested as seeds with a metagenomic dataset derived from raw sewage, composed of 53.5 million Illumina paired-end reads with a post-trimming average size of 92 bases. Either with VP1 (Supplementary Figure <xref ref-type="supplementary-material" rid="SM3">2A</xref>) or VP4 (Supplementary Figure <xref ref-type="supplementary-material" rid="SM3">2B</xref>) profile HMM seeds, the results were very similar to those observed with fecal samples, with VP1R1, VP4R1, and VP4R3 generating longer contigs than the other seeds.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p><bold>Comparison of progressive assembly using different HMM seeds</bold>. Contig profiles obtained by progressive assembly with GenSeed-HMM using a 454 dataset from fecal samples from human patients (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>) and profile HMM seeds derived from <italic>Alpavirinae</italic> major capsid protein VP1 <bold>(A)</bold> and replication initiation protein VP4 <bold>(B)</bold>. Contigs are ranked in decreasing order of size. Each marker represents a distinct contig. Profile HMMs used as seeds are depicted.</p></caption>
<graphic xlink:href="fmicb-07-00269-g0002.tif"/>
</fig>
<p>The variability in the sequence reconstruction ability by the different HMM seeds led us to investigate how the different assemblies compared to each other in terms of their contig sequences. Thus, we used the top four performers (VP1R1, VP1R4, VP4R1, and VP4R3) to run <italic>blastn</italic> all-vs-all similarity searches followed by sequence clustering. It is noteworthy that despite the overall contig size rank variation (Figures <xref ref-type="fig" rid="F2">2A,B</xref>), from the total of 85 de-replicated contigs, 25 were identified as being assembled independently by all four assemblies, whereas using either VP1 or VP4 seeds showed the second highest overlap (Figure <xref ref-type="fig" rid="F3">3</xref>). This highlights that, regardless of the differences observed in contig size rankings, the assemblies were highly consistent, even though they were derived from profile HMMs built from distinct regions and/or proteins. The only other overlap with a significant number of contigs involved nine contigs shared between VP1R1, VP4R3, and VP4R1. However, further taxonomic assignment (see section below) showed that only two of these contigs were assigned to <italic>Alpavirinae</italic>, suggesting lower precision for these seeds.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p><bold>Consistency among HMM seeds</bold>. Venn diagram representing shared contigs reconstructed by progressive assembly using GenSeed-HMM with profile HMM seeds VP1R1, VP1R4, VP4R1, and VP4R3. Contigs were included in the same cluster when presenting at least 90% similarity at the nucleotide level covering at least 90% of the length of the shortest contig. Contigs were then taxonomically classified by <italic>blastx</italic> to reference proteins from <italic>Microviridae</italic> and searched for the presence of the VP1R4 seed using <italic>hmmsearch</italic>. A large percentage of the contigs are shared among all four seeds and belong to <italic>Alpavirinae</italic> genomes covering the VP1R4 seed. Notice that contigs that were not present within the VP1R4 seed were usually not assigned to <italic>Alpavirinae</italic> (low precision) or do not contain the VP1R4 region, suggesting potential shorter non-overlapping contigs.</p></caption>
<graphic xlink:href="fmicb-07-00269-g0003.tif"/>
</fig>
<p>A further analysis of assembly performance, in particular regarding the VP1R4 seed, showed that some contigs covering this region have not been assembled using the corresponding seed, but rather by one or more other seeds (Figure <xref ref-type="fig" rid="F3">3</xref>). For instance, 14 contigs were assembled exclusively by the VP1R1 seed, with 10 of them being assigned to the <italic>Alpavirinae</italic> subfamily, and four of those covering the VP1R4 region. A detailed analysis of these latter contigs confirmed that the VP1 proteins of this subset were too divergent to be detected by the VP1R4 seed. This phenomenon was observed in all contigs assigned to <italic>Alpavirinae</italic>, but not assembled by the VP1R4 seed (Figure <xref ref-type="fig" rid="F3">3</xref>&#x02014;represented by black numbers). Another interesting observation was the fact that six contigs assigned to <italic>Alpavirinae</italic>, and containing the VP1R4 seed, have not been assembled when using this particular seed (Figure <xref ref-type="fig" rid="F3">3</xref>&#x02014;represented by red numbers). In this case, we identified three events where the VP1R4 seed successfully detected the corresponding reads, but due to a very low coverage on this specific region, Newbler was unable to generate an assembled contig in the first assembly cycle. Finally, for the remaining three events, we identified short sequences on the VP1R4 set of contigs that were very similar (but slightly below our 90% threshold) to contigs assembled by the other seeds. By comparing the read coverage of the shorter contigs with their longer counterparts, it became clear that the ends of the shorter contigs presented lower coverage than the corresponding regions in the longer ones, indicating premature extension stoppage events (data not shown). This seems to be a consequence of the directionality of the progressive assembly method. 
The assembler is able to extend the growing sequence in one direction, but, due to base discrepancies biased at a particular end of one or more reads, the resulting alignment graph ends up containing a so-called bubble, precluding the assembler from extending the sequence in the opposite direction. By precisely identifying the few different assembly failures, we expect to develop new routines that could automatically handle these problems, should they happen, during an execution.</p>
</sec>
<sec>
<title>Taxonomic assignment of assembled sequences</title>
<p>Given that the aim was to reconstruct <italic>Alpavirinae</italic> genomes from metagenomic datasets, we wanted to address the sensitivity and precision of the methodology. The sensitivity (number of <italic>Alpavirinae</italic> associated contigs from the total number of <italic>Alpavirinae</italic> viruses in a given dataset) and precision (number of <italic>Alpavirinae</italic> associated contigs from the total number of contigs assembled with a given seed) will be dependent on the specific profile HMM seed used, the quality and coverage of the sequencing and the specific parameters used. To address this point we used similarity searches with <italic>blastx</italic> against a reference dataset of <italic>Microviridae</italic> proteins (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>), together with sequence clustering analysis, and we were able to classify the contigs into three subfamilies of <italic>Microviridae</italic> (Table <xref ref-type="table" rid="T1">1</xref>). Since all HMMs have been originally built toward <italic>Alpavirinae</italic>-conserved regions, a predominance of sequence assignment to this subfamily was expected. In fact, this was the most prevalent taxon of the reconstructed sequences for all seeds. However, with the exception of VP1R4, which presented 100% precision, the three remaining HMMs also led to assembled <italic>Gokushovirinae</italic> and <italic>Pichovirinae</italic> sequences, with precision values varying from 72.3 to 79.7% (Table <xref ref-type="table" rid="T1">1</xref>). The unambiguous taxonomic assignment of VP1R4-derived contigs was confirmed by phylogenetic analysis (see below). Clustering analysis showed that among the four assemblies it was possible to generate a total of 85 non-redundant non-overlapping contigs (Table <xref ref-type="table" rid="T1">1</xref>). 
However, this result does not necessarily imply that there is a total of 85 different originating viral entities in the sample, since each assembly resulted in a number of partial, shorter contigs centered on the specific profile HMM seed that could be generated from the same virotype but, due to sequencing coverage or other factors affecting assembly, were not extended enough to identify overlaps with contigs produced by other profile HMMs.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p><bold>Taxonomic assignment of contigs (human fecal data, progressive assembly) and classification precision and sensitivity</bold>.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Subfamily</bold></th>
<th valign="top" align="center" colspan="5"><bold>Profile HMM seed</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td/>
<td valign="top" align="center"><bold>VP1R1</bold></td>
<td valign="top" align="center"><bold>VP1R4</bold></td>
<td valign="top" align="center"><bold>VP4R1</bold></td>
<td valign="top" align="center"><bold>VP4R3</bold></td>
<td valign="top" align="center"><bold>Total</bold><xref ref-type="table-fn" rid="TN1"><sup>a</sup></xref></td>
</tr>
<tr>
<td valign="top" align="left"><italic>Alpavirinae</italic></td>
<td valign="top" align="center">47</td>
<td valign="top" align="center">43</td>
<td valign="top" align="center">38</td>
<td valign="top" align="center">34</td>
<td valign="top" align="center">65</td>
</tr>
<tr>
<td valign="top" align="left"><italic>Gokushovirinae</italic></td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">11</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">17</td>
</tr>
<tr>
<td valign="top" align="left"><italic>Pichovirinae</italic></td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">1</td>
</tr>
<tr style="border-bottom: thin solid #000000;">
<td valign="top" align="left">Gokush<italic>/</italic>Alpa<xref ref-type="table-fn" rid="TN2"><sup>b</sup></xref></td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">0</td>
<td valign="top" align="center">1</td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr>
<tr style="border-bottom: thin solid #000000;">
<td valign="top" align="left">Total</td>
<td valign="top" align="center">59</td>
<td valign="top" align="center">43</td>
<td valign="top" align="center">51</td>
<td valign="top" align="center">47</td>
<td valign="top" align="center">85</td>
</tr>
<tr style="border-bottom: thin solid #000000;">
<td valign="top" align="left">VP1R4-positive</td>
<td valign="top" align="center">40</td>
<td valign="top" align="center">43</td>
<td valign="top" align="center">23</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">49</td>
</tr>
<tr>
<td valign="top" align="left">Sensitivity for <italic>Alpavirinae</italic></td>
<td valign="top" align="center">(47/65) 72.31%</td>
<td valign="top" align="center">(43/65) 66.15%</td>
<td valign="top" align="center">(38/65) 58.46%</td>
<td valign="top" align="center">(34/65) 52.31%</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Precision for <italic>Alpavirinae</italic></td>
<td valign="top" align="center">(47/59) 79.66%</td>
<td valign="top" align="center">(43/43) 100.00%</td>
<td valign="top" align="center">(38/51) 74.51%</td>
<td valign="top" align="center">(34/47) 72.34%</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Sensitivity for VP1R4</td>
<td valign="top" align="center">(40/49) 81.63%</td>
<td valign="top" align="center">(43/49) 87.76%</td>
<td valign="top" align="center">(23/49) 46.94%</td>
<td valign="top" align="center">(24/49) 48.98%</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">Precision for VP1R4</td>
<td valign="top" align="center">(40/59) 67.80%</td>
<td valign="top" align="center">(43/43) 100.00%</td>
<td valign="top" align="center">(23/51) 45.10%</td>
<td valign="top" align="center">(24/47) 51.06%</td>
<td/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Contigs generated by GenSeed-HMM with the respective profile HMM (VP1R1, VP1R4, VP4R1, and VP4R3) were compared against all reference Microviridae proteins (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>) using blastx with a cutoff E-value of 1e-20. When hits with similar scores were obtained to more than one subfamily, taxonomic assignment was set to two subfamilies. Contigs were also evaluated for the presence of VP1R4 region by hmmsearch, and the number of positive contigs is shown</italic>.</p>
<fn id="TN1">
<label>a</label>
<p><italic>Total number of de-replicated contigs (see Figure <xref ref-type="fig" rid="F3">3</xref>) that belonged to a given taxonomic assignment</italic>.</p></fn>
<fn id="TN2">
<label>b</label>
<p><italic>Represents a set of two contigs where the best BLAST hit annotation was below the E-value cutoff and they were equally distant by percent identity to Gokushovirinae and Alpavirinae, so no single assignment was possible</italic>.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Assessing the sensitivity of the different seeds constitutes a challenge since it is impossible to address the real total number of expected <italic>Alpavirinae</italic> genomes. In order to have an approximation to this value we analyzed two different metrics that should constitute an approximate range of the actual sensitivity. As an upper bound, we used the number of total contigs (independently of the seed used) assigned to <italic>Alpavirinae</italic> (<italic>n</italic> &#x0003D; 65; Table <xref ref-type="table" rid="T1">1</xref>), which is very likely to give an over-estimated sensitivity value due to independent contigs formed by different seeds that originate from a single viral entity, as mentioned above. The lower bound was done specifically for the VP1R4 seed and consists of the number of contigs from all assemblies that covered the region used to build the VP1R4 HMM (<italic>n</italic> &#x0003D; 49; Table <xref ref-type="table" rid="T1">1</xref>). By the estimation of these contig numbers it was possible to calculate that the sensitivity for the VP1R4-based assembly should be between 66.2 and 87.8% (Table <xref ref-type="table" rid="T1">1</xref> and Figure <xref ref-type="fig" rid="F3">3</xref>). In a similar way, we calculated the sensitivity and precision of the progressive assembly performed on the sewage data (Supplementary Table <xref ref-type="supplementary-material" rid="SM3">2</xref>). In this case, we observed for the VP1R4 seed a similar precision (99.67%) and a sensitivity between 35.8 and 91.6%; the wider range is due to the higher number of total contigs (2480), a consequence of the larger dataset with shorter reads generating a more fragmented assembly.</p>
<p>The advantage of using profile HMMs as seeds for progressive assembly is clear when the same data is investigated by protein similarity searches. With that aim, each of the 33 full-length VP1 sequences from Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>) was compared by <italic>blastp</italic> similarity searches (Supplementary Table <xref ref-type="supplementary-material" rid="SM3">3</xref>) to our 45 complete or partial VP1 sequences originated from the VP1R4 assembly. Even using an <italic>E</italic>-value of 1e-6, which is not particularly stringent, we have found that each of the 33 proteins matched only 16&#x02013;37 of the 45 novel sequences. This shows that a single profile HMM seed derived from a short VP1 region was much more sensitive than any of the 33 complete protein sequences for the detection of novel <italic>Alpavirinae</italic> sequences. Because these full-length sequences include stretches conserved across proteins from other viral subfamilies, they would probably yield a lower precision. To establish a fair comparison between protein and profile HMM seeds, we assessed the detection ability of GenSeed-HMM using sequences restricted to the VP1R4 seed region (coordinates 799&#x02013;816&#x02014;see Supplementary Table <xref ref-type="supplementary-material" rid="SM3">3</xref>). The observed individual detection rate was much lower indeed, varying from 0 to 4 sequences with a cutoff of 1e-6, and 0&#x02013;15 with a cutoff of 1e-2. Although these tests were performed using <italic>blastp</italic> directly instead of running GenSeed-HMM, they show, in a specific manner, that the nature of the seed is what is leading to a difference in sensitivity. 
These results indicate that a seed-driven assembly based on a single protein sequence is limited to the information contained on that sequence itself, while profile HMMs, by incorporating the variability of a full set or family of sequences, present higher sensitivity and wider range of detection.</p>
</sec>
<sec>
<title>Using multiple profile HMM seeds</title>
<p>As presented above, no single profile HMM seed was able to assess the true viral complexity of the sample (Table <xref ref-type="table" rid="T1">1</xref> and Figure <xref ref-type="fig" rid="F3">3</xref>). Since GenSeed-HMM can use multiple seeds in a single execution, we decided to run a preliminary comparative analysis to evaluate the ability of single and multiple profile HMMs to reconstruct viral genomes. The profile HMMs were employed either individually or in combination of two or four seeds to progressively assemble sequences from the 454 dataset from human fecal samples (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>). All identified <italic>Alpavirinae</italic>-specific contigs were submitted to contig size rankings (Supplementary Figure <xref ref-type="supplementary-material" rid="SM3">3</xref>), with VP1R1 exhibiting the best overall contig size profile, in agreement with what had been previously observed without filtering out contigs belonging to other <italic>Microviridae</italic> subfamilies (Figure <xref ref-type="fig" rid="F2">2</xref>). When using the VP1R4 and VP4R1 seeds, derived from two distinct viral proteins, the obtained profile was clearly better than the profiles observed with the use of any of the individual seeds. The use of pairs of seeds derived from the same protein (e.g., VP1R1/VP1R4 or VP4R1/VP4R3) did not show relevant improvement over individual seeds (data not shown). Nevertheless, it is noteworthy that a combination of the four seeds yielded the best assembly. This result strongly suggests that a rational combination of profile HMM seeds can be used to unravel the true viral diversity in a sample. It is important to highlight that the number of close to full-length contigs (around 6 kb) does not change with multiple seeds, suggesting that the longest contigs were recovered with either one or multiple seeds.</p>
</sec>
<sec>
<title>Phylogenetic analysis</title>
<p>Using an automated processing pipeline, all sequences assembled with the four profile HMMs were annotated. This automatic annotation was the basis for the identification of the VP1 genes and the respective translation to the corresponding protein sequences. Since we have determined that only the VP1R4 assembly generated sequences restricted to the <italic>Alpavirinae</italic> subfamily (see previous section), this particular annotation set was manually curated and used to produce a dataset of complete and partial VP1 protein sequences. For phylogenetic inference, we used a reference dataset of <italic>Microviridae</italic> proteins (Roux et al., <xref ref-type="bibr" rid="B42">2012</xref>) and sequences publicly available on GenBank (Supplementary Table <xref ref-type="supplementary-material" rid="SM3">1</xref>). Since some of the assembled contigs represented incomplete genomes and covered slightly more than the VP1R4 region, we performed two phylogenetic reconstructions using either full-length VP1 proteins or sequences covering approximately 75 amino acids with the VP1R4 region at the C-terminus. The tree containing 28 novel full-length sequences (Figure <xref ref-type="fig" rid="F4">4A</xref>) showed better bootstrap support than that for a tree inferred with 45 shorter sequences (Figure <xref ref-type="fig" rid="F4">4B</xref>), but both converged to the same topology. Both trees clearly separate the different subfamilies and show that all assembled contigs are completely specific to the <italic>Alpavirinae</italic> subfamily, thus corroborating our previous similarity-based taxonomic analysis. In addition, these novel sequences were not confined to a few clades, but rather spread in almost all clades containing reference sequences described by Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>) and/or available on GenBank suggesting that this subfamily is highly diverse and broadly dispersed in humans.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p><bold>Phylogenetic analysis</bold>. Maximum likelihood phylogenetic analysis of <bold>(A)</bold> full-length VP1 protein and <bold>(B)</bold> a shorter region comprising only the VP1R4 HMM region. Sequences were translated from the contigs reconstructed by GenSeed-HMM using the VP1R4 seed. Different subfamilies of the <italic>Microviridae</italic> family are depicted in distinct colors; references were obtained from Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>). Branches represented by sequences derived in this work are labeled in black. Asterisks in the nodes indicate bootstrap values higher than 70%. Numbers represent contig numbers as observed in Figure <xref ref-type="fig" rid="F5">5</xref>.</p></caption>
<graphic xlink:href="fmicb-07-00269-g0004.tif"/>
</fig>
</sec>
<sec>
<title>Intra- and inter-personal distribution of novel <italic>Alpavirinae</italic> sequences</title>
<p>Read abundance in contigs reconstructed by progressive assembly with GenSeed-HMM using the VP1R4 seed was used to assess the distribution of the newly characterized <italic>Alpavirinae</italic> sequences across the different human donor fecal samples. The novel <italic>Alpavirinae</italic> sequences showed highly conserved intrapersonal patterns along different time points (Figure <xref ref-type="fig" rid="F5">5</xref>), similarly to what has been previously observed for whole viromes (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>). Conversely, interpersonal viral variations were much higher, with few cases of shared contigs even between twins of the same family, except for the twins in family 2. It has been suggested that the <italic>Alpavirinae</italic> subfamily is linked to genera of the Bacteroidetes phylum (Krupovic and Forterre, <xref ref-type="bibr" rid="B22">2011</xref>). Our results show that distinct individuals harbor different amounts of each of these viruses (Figure <xref ref-type="fig" rid="F5">5</xref>), which are usually not closely phylogenetically related (Figure <xref ref-type="fig" rid="F4">4B</xref>), suggesting that they are probably associated with different Bacteroidetes taxa.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p><bold>Read abundance of contigs</bold>. Heatmap diagram representing read abundance in contigs reconstructed by progressive assembly with GenSeed-HMM and the VP1R4 seed. Fecal biospecimens were collected from different families (F1&#x02013;F4) composed of monozygotic twins (T1 and T2) and their respective mothers (M). Time points of sample collection and technical replicates (R) are depicted. Data source: 454 dataset from fecal samples of human patients (Reyes et al., <xref ref-type="bibr" rid="B37">2010</xref>).</p></caption>
<graphic xlink:href="fmicb-07-00269-g0005.tif"/>
</fig>
</sec>
<sec>
<title>Progressive vs. global assembly</title>
<p>To address how progressive assembly performs against conventional global assembly, we compared our contigs generated using GenSeed-HMM with the VP1R4 seed to the results obtained using Newbler in a standalone execution for the same original dataset (global assembly). By selecting only contigs coding for VP1R4-positive proteins (using <italic>hmmsearch</italic>), a fair comparison between both assembly methods could be established. An initial analysis, based on cumulative contig lengths (Figure <xref ref-type="fig" rid="F6">6A</xref>), showed a very similar assembly performance for the 15 longest contigs obtained using the human fecal samples. From this result it can be appreciated that the progressive method clearly had a better assembly performance, characterized by a higher number of assembled bases (169 kb) than the global assembly (148 kb). The total number of contigs was similar for both approaches, with 45 contigs in the case of progressive assembly and 44 with global assembly. When the same test was applied to the Illumina dataset derived from a raw sewage sample (Figure <xref ref-type="fig" rid="F6">6B</xref>), we observed even more pronounced differences. In this case, progressive assembly yielded a total of 360 kb of assembled sequence comprising 453 contigs, whereas the conventional method showed a more fragmented assembly, with a total of 216 kb and 471 contigs (see Supplementary File <xref ref-type="supplementary-material" rid="SM2">2</xref>). Given the environmental nature of the raw sewage sample, a much wider viral diversity should be expected. In fact, the total number of contigs was much higher than that observed in human fecal samples. To compare the sensitivity and precision obtained with the GenSeed-HMM method and the global assembly, we performed the same taxonomic annotation and clustering analysis on these contigs. 
The results (Supplementary Tables <xref ref-type="supplementary-material" rid="SM3">4</xref>, <xref ref-type="supplementary-material" rid="SM3">5</xref>) showed equivalent values of sensitivity and precision with the VP1R4 seed, confirming that both strategies have similar ability to recover the viral genomes (both are based on the same assembler), but GenSeed-HMM recovers longer contigs with more efficient use of computational resources and completely centered on the target sequences.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p><bold>Comparison between global and progressive assembly</bold>. Comparison of cumulative contig lengths using progressive assembly with GenSeed-HMM and VP1R4 HMM seed and global assembly with Newbler. Data sources: <bold>(A,C,D)</bold> 454 dataset from fecal samples of human patients (Reyes et al., <xref ref-type="bibr" rid="B38">2012</xref>); <bold>(B)</bold> Illumina dataset from a sewage treatment plant at the municipality of Tabo&#x000E3;o da Serra, S&#x000E3;o Paulo, Brazil (unpublished data). Contigs from progressive assembly and VP1R4-positive contigs from global assembly were clustered at 97% identity over at least 90% of the shortest contig; each cluster consisted of at most one contig from each dataset. A total of 53 clusters were generated, nine unique for the progressive assembly and eight unique for global assembly. Plotted is the comparison in lengths <bold>(C)</bold> and coverage <bold>(D)</bold> for related contigs obtained by progressive and global assemblies and ranked by size.</p></caption>
<graphic xlink:href="fmicb-07-00269-g0006.tif"/>
</fig>
<p>To further characterize the consistency among the results obtained with the different strategies, we used the assembly from fecal samples for a similarity clustering at 97% identity to identify cases where the same contig was found in both assemblies. In this case, we observed that each cluster contains at most one contig from each assembly strategy. In the case of the human fecal samples, we identified a total of 53 non-redundant contigs where nine of those were unique to the progressive assembly and eight were unique to the global assembly. When comparing the contig lengths for each pair of clustered contigs (Figure <xref ref-type="fig" rid="F6">6C</xref>) it was possible to see that in 20 cases both assemblies yielded contigs of essentially the same length, while in 11 cases progressive contigs were longer than the global ones, and in five cases the opposite was observed. These findings confirmed once more that the progressive strategy was mostly capable of generating longer contigs from the same original seed than a global strategy. When comparing contig read coverage (Figure <xref ref-type="fig" rid="F6">6D</xref>) in the same dataset, it was clear that both strategies assemble contigs with similar coverage, suggesting that there is no coverage bias for the contigs assembled with the iterative progressive assembly.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>In this work, we describe the development of GenSeed-HMM, a program that implements a seed-driven progressive assembly approach using profile HMMs as seeds, in addition to nucleotide and protein sequences. We also demonstrate the application of the implemented method for viral discovery using <italic>Alpavirinae</italic> as a case study. Using a previously published dataset it was possible to assemble a total of 85 <italic>Microviridae</italic> associated contigs, with 25 of those likely representing full viral genomes. Phylogenetic analysis showed that those novel assembled contigs contained representatives of all known clades of the <italic>Alpavirinae</italic> subfamily, significantly contributing to the knowledge regarding those viruses in the human gut. The use of GenSeed-HMM to assemble <italic>de novo</italic> viral genomes present in metagenomes provides a very important resource for the characterization and understanding of the role of different viruses and viral families in the microbial ecology of complex environments.</p>
<p>The current study also shows that our progressive assembly strategy generates an overall higher number of longer contigs, with read coverage equivalent to that observed in the corresponding global assemblies. This improvement could be due to effects of repetitive regions that can create chimeric contigs or even hamper global assembly, especially if these regions are longer than the average read length. Another potential problem is represented by polymorphic sequences, a feature commonly found in viral populations. In the case of global assembly, reads are analyzed all at once to construct the assembly graphs. Conversely, progressive assembly is driven by a single seed or, in the worst case, a relatively small number of seeds. This means that the search space is dramatically reduced since a very strict subset of reads is selected from the main dataset. Each assembled contig then originates two extension seeds, one from each of the contig&#x00027;s ends, which in turn will be used to select new small subsets of reads. Thus, each assembly round employs these relatively few reads plus the previously generated contig, which acts as a guide for sequence growth. Hence, when a repetitive region already present in a previously assembled part of the contig is reached, no newly recruited reads will disrupt the sequence already assembled. The whole process is therefore highly directional, starting from the seed sequence up to the final optimal assembly. This particular <italic>modus operandi</italic> is important to prevent repetitive sequences from leading to chimeric assemblies, which could entrap the process by artificially joining physically unrelated sequences.</p>
<p>A classical protocol for detecting viral sequences from metagenomic data is to assemble the sequence reads and then submit the contigs to BLAST searches against databases of known viral genomes or protein sequences (Cantalupo et al., <xref ref-type="bibr" rid="B8">2011</xref>; Bibby and Peccia, <xref ref-type="bibr" rid="B4">2013</xref>; Norman et al., <xref ref-type="bibr" rid="B30">2015</xref>). This approach is severely limited by the fact that pairwise sequence comparison methods fail to detect distant evolutionary relationships, with sequence identities of around 30% seeming to represent a threshold value for identifying true homologs (Brenner et al., <xref ref-type="bibr" rid="B6">1998</xref>). In the case of viral discovery, this scenario is even more challenging because of the typically high substitution rates, especially in RNA viruses that replicate through error-prone RNA-dependent RNA polymerases (RdRP). Also, the bias of sequence data available for the different viral families limits the effectiveness of similarity searches. Search methods relying on profiles are more sensitive than pairwise alignments because they incorporate broader position-specific information as well as a quantification of the range of substitutions observed across different members of the group. From several methods available, profile HMMs seem to be the most effective to detect distantly related organisms (Park et al., <xref ref-type="bibr" rid="B32">1998</xref>). More recently, Skewes-Cox et al. (<xref ref-type="bibr" rid="B46">2014</xref>) reported a method to generate viral profile HMMs (vFams) for the detection of viruses from metagenomic data and the public release of a database composed of more than 4000 such profiles (vFam&#x02014;<ext-link ext-link-type="uri" xlink:href="http://derisilab.ucsf.edu/software/vFam/">http://derisilab.ucsf.edu/software/vFam/</ext-link>). 
These profile HMMs, constructed from MSAs covering the entire sequence of the respective proteins, showed a higher precision than BLAST searches in real metagenomic datasets, especially for more divergent viral sequences. According to the authors, vFams could be used to nucleate metagenomic assemblies with selected reads to produce longer sequences, in an approach similar to the one previously proposed by our group (Sobreira and Gruber, <xref ref-type="bibr" rid="B49">2008</xref>). Another important aspect pointed out by Skewes-Cox et al. (<xref ref-type="bibr" rid="B46">2014</xref>) is the fact that both BLAST and HMM-based methods rely on some degree of similarity to already known viruses, meaning that updating sequence databases on a regular basis is essential for the future effectiveness of such methods, and that bioinformatics approaches based on <italic>de novo</italic> metagenomic assembly and <italic>ab initio</italic> structural prediction algorithms will have increasing importance. In this direction, there is also room to improve seed development with the possible addition of protein structure information in profile HMM design for probing deep phylogenetic associations (Deng and Cheng, <xref ref-type="bibr" rid="B12">2014</xref>).</p>
<p>Compared to the original GenSeed program (Sobreira and Gruber, <xref ref-type="bibr" rid="B49">2008</xref>), the concept of using seeds to drive the assembly process has been extended in GenSeed-HMM by the development of specific routines to deal with profile HMMs. In fact, the originally proposed nucleotide and protein seeds could drive the assembly of sequences derived from the same species or from evolutionarily close organisms. A few previous attempts using our original concepts of seed-driven and/or progressive assembly have been described, but were limited in application to fewer genomic assembly programs, DNA sequence seeds, or non-metagenomic input data (Smits et al., <xref ref-type="bibr" rid="B47">2015</xref>). The original GenSeed program already used either DNA or protein sequences as seeds for iterative assembly, and GenSeed-HMM greatly expands on these capabilities by allowing the use of read data from different sequencing technologies, multiple assemblers, and profile HMM seeds. Tools such as PRICE (Ruby et al., <xref ref-type="bibr" rid="B43">2013</xref>), which also use GenSeed&#x00027;s original assembly principles, are based exclusively on DNA seeds, limiting their potential for viral discovery. Indeed, even using protein sequences, which are much more conserved than DNA, the profile HMM seed derived from a short VP1 region (VP1R4) was much more sensitive than any of the 33 complete VP1 protein sequences from Roux et al. (<xref ref-type="bibr" rid="B42">2012</xref>) for the detection of novel <italic>Alpavirinae</italic> sequences. Profile HMMs increase the spectrum of detectable organisms since they are built from MSAs derived from many organisms, encompassing a large range of variability within a single probabilistic model. The use of profile HMMs in a targeted gene assembly tool has been recently implemented in the SAT-Assembler program (Zhang et al., <xref ref-type="bibr" rid="B56">2014</xref>). 
Using a concept similar to the seed-driven assembly described by our group and implemented in GenSeed (Sobreira and Gruber, <xref ref-type="bibr" rid="B49">2008</xref>), SAT-Assembler uses the seeds to select reads from datasets and then proceeds to construct its own overlap graph for the assembly, also avoiding an all-against-all sequence comparison. However, SAT-Assembler can only generate a consensus sequence that is limited to these reads. Conversely, by means of the progressive assembly method, GenSeed-HMM can extend the sequence reconstruction as much as possible, according to user requirements. This is especially important, since the assembly is not restricted to the gene itself, but also extends to its flanking regions, providing genomic context information. In fact, by using the appropriate number of assembly cycles, an entire viral (or other episomes, such as mitochondrial) genome can be reconstructed using a single seed, provided that sufficient read coverage is available in the sequencing dataset, as shown in the current study.</p>
<p>When applied to viral discovery, simultaneous use of multiple seeds can substantially increase the sensitivity of the method by generating several starting points for assembly. If maximum sensitivity is required, combining seeds is important, as our results show that no single profile HMM seed can assess the true viral diversity present on any sequencing dataset. However, the proper choice of seeds is essential, since closely placed seeds may be inefficient for two reasons: (1) if the seeds are directed toward physically close regions, chances are that low read coverage may apply to all of them; and (2) because of the physical proximity of the seeds, specific reads recruited by a seed could overlap reads selected by other seeds, implying that the progressive assembly might give rise to something approaching a classical global assembly. Our results show that using seeds derived from different proteins is a more sensible approach. However, it is worth mentioning that using multiple seeds to attain maximum sensitivity may come at the price of lowering precision. A general recommendation for seed design includes avoiding low-complexity regions, as they would result in non-specific reads being recruited and assembled, with a consequent lack of specificity. A good compromise between sequence conservation/divergence of the region selected for profile HMM building may vary from case to case and there is no <italic>a priori</italic> set of rules. Delimiting the range of targeted taxa may help to define whether the profile HMM seeds should be built from selected regions or from a full-length protein sequence. Specific routines could also be implemented in future versions of GenSeed-HMM to identify and discard spurious non-specific sequences. The development of multiple seeds could also profit from a nested, hierarchical-based rationale for seed design and use that should entail aspects of viral taxonomy. 
For example, one could progressively use sets of seeds, initiating by using replicases, which would then lead to an informed choice of helicase and capsid-derived seeds, and so on. This would drive new virus discovery from core functions, such as replicases and capsid genes (that define viral families) to more contextual functions, such as receptor glycoproteins that would be more informative at the genus level (de Andrade Zanotto and Krakauer, <xref ref-type="bibr" rid="B10">2008</xref>; Krakauer and Zanotto, <xref ref-type="bibr" rid="B21">2008</xref>). We foresee that a rational protocol of profile HMM construction can be established focusing on the development of narrow- and wide-range taxonomic associations. For instance, specific profile HMMs could be built for the detection of well-delimited taxonomic groups such as subfamilies or families.</p>
<p>A paradigm of diagnosis, using either serological or nucleic acid-based methods, is that one can only diagnose organisms that are already known. For instance, given a pathogen to be identified by a serological assay, it is mandatory to first establish which antigens or antibodies will be the targets of detection. Likewise, PCR-based assays rely on previous knowledge of the target sequences to be amplified, and microarray-based assays, such as the Virochip, are based on known hybridization targets. Viruses are biological entities in which evolution can be observed in comparably short spans of time, given their fast rates of mutation and substitution. In fact, since the nineteen-seventies, we have witnessed the emergence of many novel human and animal diseases, such as Acquired Immune Deficiency Syndrome (AIDS) caused by the human immunodeficiency virus (HIV), Ebola virus disease (EVD), among others (Palacios et al., <xref ref-type="bibr" rid="B31">2008</xref>; Wang, <xref ref-type="bibr" rid="B53">2011</xref>; Rosenberg, <xref ref-type="bibr" rid="B41">2015</xref>). Metagenomic data has contributed to surveys of viral diversity (Bibby and Peccia, <xref ref-type="bibr" rid="B4">2013</xref>) and the discovery of novel animal (Bel&#x000E1;k et al., <xref ref-type="bibr" rid="B2">2013</xref>) and human (Tang and Chiu, <xref ref-type="bibr" rid="B51">2010</xref>; Siebrasse et al., <xref ref-type="bibr" rid="B44">2012</xref>; Phan et al., <xref ref-type="bibr" rid="B33">2015</xref>; Reyes et al., <xref ref-type="bibr" rid="B36">2015</xref>) viruses. The pace of viral discovery is increasing, including many emergent zoonotic viruses pathogenic to humans (Wang, <xref ref-type="bibr" rid="B53">2011</xref>; Rosenberg, <xref ref-type="bibr" rid="B41">2015</xref>). Given the ever-growing amount of sequence data, the challenge is how to diagnose new potentially emerging pathogens without knowing what one is looking for. 
Considering that emerging viruses moved into humans from pre-existing lineages from the zoonotic pool, some key structures are conserved in essential functions such as replication and capsid proteins. HMMs able to potentially detect a wider range of taxa could be used for epidemiological surveillance, in order to monitor the emergence of new variants of already known viruses or even detect the emergence of novel viruses. Profile HMMs have a series of advantages that make them ideally suited to detect sequences that have not been sampled in the original MSA within a reasonable margin of divergence, detecting members related to those used for the construction of the profile that likely share the same selective pressures. This feature opens up a new possibility, namely the diagnosis of novel viruses potentially pathogenic to humans and animals, without previous specific information, an approach that we refer to as <italic>de novo</italic> diagnosis. We believe that <italic>de novo</italic> diagnosis using rationally designed profile HMMs may assume a fundamental importance for epidemiological surveillance in some sentinel sites such as hospitals, sewage treatment stations, animal production facilities, and migratory bird colonies, among others. By detecting emerging viruses at these sites, it would be possible to undertake containment measures to prevent the spread of potentially devastating diseases. GenSeed-HMM provides a fast and simple implementation to run progressive assembly pipelines using profile HMMs covering the most relevant groups of viral pathogens. By combining rational design of profile HMMs and multiple GenSeed-HMM runs, one can foresee a replacement of the paradigm of conventional diagnosis.</p>
<p>In this work we exemplified how GenSeed-HMM could be used for viral discovery. Nonetheless, the spectrum of potential applications of the seed-driven progressive assembly method using profile HMMs is much wider. Besides viral genomes, the method is well suited for surveys of extra-chromosomal elements such as plastid and mitochondrial genomes from metagenomic data. This is particularly relevant for the exploration of some specific target sequences from largely contaminated datasets such as paleometagenomic samples. Profile HMM seeds can also be used to reconstruct specific protein coding genes for gene diversity studies, thus determining all possible gene variants present in a metagenomic sample, independently of their organism of origin. Such surveys could be useful to detect the emergence of drug-resistant variants in sensitive environments such as hospitals and animal production facilities, where antibiotics are regularly used. In addition, the extra length obtained with iterative progressive assembly of these target-specific sequences could reveal their genomic context, that is, whether they originate from chromosomal or episomal sources, and are surrounded by other genes involved with drug-resistance and/or associated with transposable elements. By using multiple profile HMM seeds, built from proteins from a specific pathway, GenSeed-HMM allows one to assess the occurrence of this pathway in specific environmental metagenomic samples, even if the gene complement is derived from multiple organism sources. Finally, another interesting application is the use of the progressive assembly method as an adjunct for gap closure on assembly finishing projects, by using multiple contig ends as anchored seeds to promote a sequence walking/progressive assembly process in which overlapping sequences can lead to gap closure. 
Using an in-house script for this specific application, we were able to close around 80% of the gaps of a bacterial sequencing project (data not shown). In conclusion, GenSeed-HMM is a multipurpose program under active development, and we envisage its growing application in a variety of forthcoming projects.</p>
</sec>
<sec id="s5">
<title>Author contributions</title>
<p>AG and AR conceived and designed the study. AMD, JA, and PZ contributed to the design of the experiments. AG, AR, ALO, JMG, JA, LO, MT, and TS performed the experiments. AG, AR, ALO, JMG, JA, and TS analyzed the data. DM and EM collected raw sewage samples and generated sequencing data. AG and AR prepared the first draft of the manuscript. JA and PZ participated in the discussion and writing of the manuscript. All authors revised the manuscript and have agreed to the final content.</p>
<sec>
<title>Conflict of interest statement</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</sec>
</body>
<back>
<ack><p>AG, PZ, and AMD received Productivity-in-Research fellowships from the National Council for Scientific and Technological Development (CNPq). TS received an IC scholarship from PIBIC/CNPq. EM received a DT scholarship from CAPES. JA is supported by grant &#x00023;2013/14622-3, S&#x000E3;o Paulo Research Foundation (FAPESP). AR is supported by FAPA internal funding at Universidad de los Andes. JMG is supported by Young Investigator award from Colciencias and the School of Sciences at Universidad de los Andes. ALO received an MS scholarship from CAPES and S&#x000E3;o Paulo Research Foundation - FAPESP (&#x00023;2010/04609-1).</p>
</ack>
<sec sec-type="supplementary-material" id="s6">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="http://journal.frontiersin.org/article/10.3389/fmicb.2016.00269">http://journal.frontiersin.org/article/10.3389/fmicb.2016.00269</ext-link></p>
<p><supplementary-material xlink:href="DataSheet1.ZIP" id="SM1" mimetype="application/zip" xmlns:xlink="http://www.w3.org/1999/xlink"/></p>
<p><supplementary-material xlink:href="DataSheet2.XLSX" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/></p>
<p><supplementary-material xlink:href="DataSheet3.docx" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/></p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Arumugam</surname> <given-names>M.</given-names></name> <name><surname>Raes</surname> <given-names>J.</given-names></name> <name><surname>Pelletier</surname> <given-names>E.</given-names></name> <name><surname>Le Paslier</surname> <given-names>D.</given-names></name> <name><surname>Yamada</surname> <given-names>T.</given-names></name> <name><surname>Mende</surname> <given-names>D. R.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Enterotypes of the human gut microbiome</article-title>. <source>Nature</source> <volume>473</volume>, <fpage>174</fpage>&#x02013;<lpage>180</lpage>. <pub-id pub-id-type="doi">10.1038/nature09944</pub-id><pub-id pub-id-type="pmid">21508958</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bel&#x000E1;k</surname> <given-names>S.</given-names></name> <name><surname>Karlsson</surname> <given-names>O. E.</given-names></name> <name><surname>Blomstr&#x000F6;m</surname> <given-names>A. L.</given-names></name> <name><surname>Berg</surname> <given-names>M.</given-names></name> <name><surname>Granberg</surname> <given-names>F.</given-names></name></person-group> (<year>2013</year>). <article-title>New viruses in veterinary medicine, detected by metagenomic approaches</article-title>. <source>Vet. Microbiol.</source> <volume>165</volume>, <fpage>95</fpage>&#x02013;<lpage>101</lpage>. <pub-id pub-id-type="doi">10.1016/j.vetmic.2013.01.022</pub-id><pub-id pub-id-type="pmid">23428379</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bexfield</surname> <given-names>N.</given-names></name> <name><surname>Kellam</surname> <given-names>P.</given-names></name></person-group> (<year>2011</year>). <article-title>Metagenomics and the molecular identification of novel viruses</article-title>. <source>Vet. J.</source> <volume>190</volume>, <fpage>191</fpage>&#x02013;<lpage>198</lpage>. <pub-id pub-id-type="doi">10.1016/j.tvjl.2010.10.014</pub-id><pub-id pub-id-type="pmid">21111643</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bibby</surname> <given-names>K.</given-names></name> <name><surname>Peccia</surname> <given-names>J.</given-names></name></person-group> (<year>2013</year>). <article-title>Identification of viral pathogen diversity in sewage sludge by metagenome analysis</article-title>. <source>Environ. Sci. Technol.</source> <volume>47</volume>, <fpage>1945</fpage>&#x02013;<lpage>1951</lpage>. <pub-id pub-id-type="doi">10.1021/es305181x</pub-id><pub-id pub-id-type="pmid">23346855</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Breitbart</surname> <given-names>M.</given-names></name> <name><surname>Salamon</surname> <given-names>P.</given-names></name> <name><surname>Andresen</surname> <given-names>B.</given-names></name> <name><surname>Mahaffy</surname> <given-names>J. M.</given-names></name> <name><surname>Segall</surname> <given-names>A. M.</given-names></name> <name><surname>Mead</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2002</year>). <article-title>Genomic analysis of uncultured marine viral communities</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>99</volume>, <fpage>14250</fpage>&#x02013;<lpage>14255</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.202488399</pub-id><pub-id pub-id-type="pmid">12384570</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brenner</surname> <given-names>S. E.</given-names></name> <name><surname>Chothia</surname> <given-names>C.</given-names></name> <name><surname>Hubbard</surname> <given-names>T. J.</given-names></name></person-group> (<year>1998</year>). <article-title>Assessing sequence comparison methods with reliable structurally identified distant evolutionary relationships</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>95</volume>, <fpage>6073</fpage>&#x02013;<lpage>6078</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.95.11.6073</pub-id><pub-id pub-id-type="pmid">9600919</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Camacho</surname> <given-names>C.</given-names></name> <name><surname>Coulouris</surname> <given-names>G.</given-names></name> <name><surname>Avagyan</surname> <given-names>V.</given-names></name> <name><surname>Ma</surname> <given-names>N.</given-names></name> <name><surname>Papadopoulos</surname> <given-names>J.</given-names></name> <name><surname>Bealer</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>BLAST&#x0002B;: architecture and applications</article-title>. <source>BMC Bioinformatics</source> <volume>10</volume>:<fpage>421</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-10-421</pub-id><pub-id pub-id-type="pmid">20003500</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cantalupo</surname> <given-names>P. G.</given-names></name> <name><surname>Calgua</surname> <given-names>B.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Hundesa</surname> <given-names>A.</given-names></name> <name><surname>Wier</surname> <given-names>A. D.</given-names></name> <name><surname>Katz</surname> <given-names>J. P.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Raw sewage harbors diverse viral populations</article-title>. <source>mBio</source> <volume>2</volume>:<fpage>e00180-11</fpage>. <pub-id pub-id-type="doi">10.1128/mBio.00180-11</pub-id><pub-id pub-id-type="pmid">21972239</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Darriba</surname> <given-names>D.</given-names></name> <name><surname>Taboada</surname> <given-names>G. L.</given-names></name> <name><surname>Doallo</surname> <given-names>R.</given-names></name> <name><surname>Posada</surname> <given-names>D.</given-names></name></person-group> (<year>2011</year>). <article-title>ProtTest 3: fast selection of best-fit models of protein evolution</article-title>. <source>Bioinformatics</source> <volume>27</volume>, <fpage>1164</fpage>&#x02013;<lpage>1165</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr088</pub-id><pub-id pub-id-type="pmid">21335321</pub-id></citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Andrade Zanotto</surname> <given-names>P. M.</given-names></name> <name><surname>Krakauer</surname> <given-names>D. C.</given-names></name></person-group> (<year>2008</year>). <article-title>Complete genome viral phylogenies suggests the concerted evolution of regulatory cores and accessory satellites</article-title>. <source>PLoS ONE</source> <volume>3</volume>:<fpage>e3500</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0003500</pub-id><pub-id pub-id-type="pmid">18941535</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delcher</surname> <given-names>A. L.</given-names></name> <name><surname>Bratke</surname> <given-names>K. A.</given-names></name> <name><surname>Powers</surname> <given-names>E. C.</given-names></name> <name><surname>Salzberg</surname> <given-names>S. L.</given-names></name></person-group> (<year>2007</year>). <article-title>Identifying bacterial genes and endosymbiont DNA with Glimmer</article-title>. <source>Bioinformatics</source> <volume>23</volume>, <fpage>673</fpage>&#x02013;<lpage>679</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btm009</pub-id><pub-id pub-id-type="pmid">17237039</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>X.</given-names></name> <name><surname>Cheng</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>Enhancing HMM-based protein profile-profile alignment with structural features and evolutionary coupling information</article-title>. <source>BMC Bioinformatics</source> <volume>15</volume>:<fpage>252</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-15-252</pub-id><pub-id pub-id-type="pmid">25062980</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Durham</surname> <given-names>A. M.</given-names></name> <name><surname>Kashiwabara</surname> <given-names>A. Y.</given-names></name> <name><surname>Matsunaga</surname> <given-names>F. T.</given-names></name> <name><surname>Ahagon</surname> <given-names>P. H.</given-names></name> <name><surname>Rainone</surname> <given-names>F.</given-names></name> <name><surname>Varuzza</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2005</year>). <article-title>EGene: a configurable pipeline generation system for automated sequence analysis</article-title>. <source>Bioinformatics</source> <volume>21</volume>, <fpage>2812</fpage>&#x02013;<lpage>2813</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bti424</pub-id><pub-id pub-id-type="pmid">15814554</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dutilh</surname> <given-names>B. E.</given-names></name> <name><surname>Cassman</surname> <given-names>N.</given-names></name> <name><surname>McNair</surname> <given-names>K.</given-names></name> <name><surname>Sanchez</surname> <given-names>S. E.</given-names></name> <name><surname>Silva</surname> <given-names>G. G.</given-names></name> <name><surname>Boling</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>A highly abundant bacteriophage discovered in the unknown sequences of human faecal metagenomes</article-title>. <source>Nat. Commun.</source> <volume>5</volume>:<fpage>4498</fpage>. <pub-id pub-id-type="doi">10.1038/ncomms5498</pub-id><pub-id pub-id-type="pmid">25058116</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eddy</surname> <given-names>S. R.</given-names></name></person-group> (<year>2011</year>). <article-title>Accelerated profile HMM searches</article-title>. <source>PLoS Comput. Biol.</source> <volume>7</volume>:<fpage>e1002195</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1002195</pub-id><pub-id pub-id-type="pmid">22039361</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Edgar</surname> <given-names>R. C.</given-names></name></person-group> (<year>2004</year>). <article-title>MUSCLE: multiple sequence alignment with high accuracy and high throughput</article-title>. <source>Nucleic Acids Res.</source> <volume>32</volume>, <fpage>1792</fpage>&#x02013;<lpage>1797</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkh340</pub-id><pub-id pub-id-type="pmid">15034147</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>El-Metwally</surname> <given-names>S.</given-names></name> <name><surname>Hamza</surname> <given-names>T.</given-names></name> <name><surname>Zakaria</surname> <given-names>M.</given-names></name> <name><surname>Helmy</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>Next-generation sequence assembly: four stages of data processing and computational challenges</article-title>. <source>PLoS Comput. Biol.</source> <volume>9</volume>:<fpage>e1003345</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003345</pub-id><pub-id pub-id-type="pmid">24348224</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fancello</surname> <given-names>L.</given-names></name> <name><surname>Raoult</surname> <given-names>D.</given-names></name> <name><surname>Desnues</surname> <given-names>C.</given-names></name></person-group> (<year>2012</year>). <article-title>Computational tools for viral metagenomics and their application in clinical research</article-title>. <source>Virology</source> <volume>434</volume>, <fpage>162</fpage>&#x02013;<lpage>174</lpage>. <pub-id pub-id-type="doi">10.1016/j.virol.2012.09.025</pub-id><pub-id pub-id-type="pmid">23062738</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>X.</given-names></name> <name><surname>Madan</surname> <given-names>A.</given-names></name></person-group> (<year>1999</year>). <article-title>CAP3: a DNA sequence assembly program</article-title>. <source>Genome Res.</source> <volume>9</volume>, <fpage>868</fpage>&#x02013;<lpage>877</lpage>. <pub-id pub-id-type="doi">10.1101/gr.9.9.868</pub-id><pub-id pub-id-type="pmid">10508846</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>M. S.</given-names></name> <name><surname>Park</surname> <given-names>E. J.</given-names></name> <name><surname>Roh</surname> <given-names>S. W.</given-names></name> <name><surname>Bae</surname> <given-names>J. W.</given-names></name></person-group> (<year>2011</year>). <article-title>Diversity and abundance of single-stranded DNA viruses in human feces</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>77</volume>, <fpage>8062</fpage>&#x02013;<lpage>8070</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.06331-11</pub-id><pub-id pub-id-type="pmid">21948823</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Krakauer</surname> <given-names>D. C.</given-names></name> <name><surname>Zanotto</surname> <given-names>P.</given-names></name></person-group> (<year>2008</year>). <article-title>Viral individuality and limitations of the life concept</article-title>, in <source>Protocells: Bridging Nonliving and Living Matter</source>, eds <person-group person-group-type="editor"><name><surname>Rasmussen</surname> <given-names>S.</given-names></name> <name><surname>Bedau</surname> <given-names>M. A.</given-names></name> <name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Deamer</surname> <given-names>D.</given-names></name> <name><surname>Krakauer</surname> <given-names>D. C.</given-names></name> <name><surname>Packard</surname> <given-names>N. H.</given-names></name> <name><surname>Stadler</surname> <given-names>P. F.</given-names></name></person-group> (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name> Scholarship Online), <fpage>513</fpage>&#x02013;<lpage>536</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Krupovic</surname> <given-names>M.</given-names></name> <name><surname>Forterre</surname> <given-names>P.</given-names></name></person-group> (<year>2011</year>). <article-title>Microviridae goes temperate: microvirus-related proviruses reside in the genomes of Bacteroidetes</article-title>. <source>PLoS ONE</source> <volume>6</volume>:<fpage>e19893</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0019893</pub-id><pub-id pub-id-type="pmid">21572966</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Langmead</surname> <given-names>B.</given-names></name> <name><surname>Salzberg</surname> <given-names>S. L.</given-names></name></person-group> (<year>2012</year>). <article-title>Fast gapped-read alignment with Bowtie 2</article-title>. <source>Nat. Methods</source> <volume>9</volume>, <fpage>357</fpage>&#x02013;<lpage>359</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.1923</pub-id><pub-id pub-id-type="pmid">22388286</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lipkin</surname> <given-names>W. I.</given-names></name> <name><surname>Firth</surname> <given-names>C.</given-names></name></person-group> (<year>2013</year>). <article-title>Viral surveillance and discovery</article-title>. <source>Curr. Opin. Virol.</source> <volume>3</volume>, <fpage>199</fpage>&#x02013;<lpage>204</lpage>. <pub-id pub-id-type="doi">10.1016/j.coviro.2013.03.010</pub-id><pub-id pub-id-type="pmid">23602435</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>R.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>W.</given-names></name> <name><surname>Yuan</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>SOAPdenovo2: an empirically improved memory-efficient short-read <italic>de novo</italic> assembler</article-title>. <source>Gigascience</source> <volume>1</volume>:<fpage>18</fpage>. <pub-id pub-id-type="doi">10.1186/2047-217X-1-18</pub-id><pub-id pub-id-type="pmid">23587118</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mayer</surname> <given-names>E. A.</given-names></name> <name><surname>Knight</surname> <given-names>R.</given-names></name> <name><surname>Mazmanian</surname> <given-names>S. K.</given-names></name> <name><surname>Cryan</surname> <given-names>J. F.</given-names></name> <name><surname>Tillisch</surname> <given-names>K.</given-names></name></person-group> (<year>2014</year>). <article-title>Gut microbes and the brain: paradigm shift in neuroscience</article-title>. <source>J. Neurosci.</source> <volume>34</volume>, <fpage>15490</fpage>&#x02013;<lpage>15496</lpage>. <pub-id pub-id-type="doi">10.1523/JNEUROSCI.3299-14.2014</pub-id><pub-id pub-id-type="pmid">25392516</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mehnert</surname> <given-names>D. U.</given-names></name> <name><surname>Stewien</surname> <given-names>K. E.</given-names></name></person-group> (<year>1993</year>). <article-title>Detection and distribution of rotavirus in raw sewage and creeks in Sao Paulo, Brazil</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>59</volume>, <fpage>140</fpage>&#x02013;<lpage>143</lpage>. <pub-id pub-id-type="pmid">8382461</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Milne</surname> <given-names>I.</given-names></name> <name><surname>Stephen</surname> <given-names>G.</given-names></name> <name><surname>Bayer</surname> <given-names>M.</given-names></name> <name><surname>Cock</surname> <given-names>P. J.</given-names></name> <name><surname>Pritchard</surname> <given-names>L.</given-names></name> <name><surname>Cardle</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>Using Tablet for visual exploration of second-generation sequencing data</article-title>. <source>Brief. Bioinformatics</source> <volume>14</volume>, <fpage>193</fpage>&#x02013;<lpage>202</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbs012</pub-id><pub-id pub-id-type="pmid">22445902</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mitchell</surname> <given-names>A.</given-names></name> <name><surname>Chang</surname> <given-names>H. Y.</given-names></name> <name><surname>Daugherty</surname> <given-names>L.</given-names></name> <name><surname>Fraser</surname> <given-names>M.</given-names></name> <name><surname>Hunter</surname> <given-names>S.</given-names></name> <name><surname>Lopez</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>The InterPro protein families database: the classification resource after 15 years</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>D213</fpage>&#x02013;<lpage>D221</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gku1243</pub-id><pub-id pub-id-type="pmid">25428371</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Norman</surname> <given-names>J. M.</given-names></name> <name><surname>Handley</surname> <given-names>S. A.</given-names></name> <name><surname>Baldridge</surname> <given-names>M. T.</given-names></name> <name><surname>Droit</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>C. Y.</given-names></name> <name><surname>Keller</surname> <given-names>B. C.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Disease-specific alterations in the enteric virome in inflammatory bowel disease</article-title>. <source>Cell</source> <volume>160</volume>, <fpage>447</fpage>&#x02013;<lpage>460</lpage>. <pub-id pub-id-type="doi">10.1016/j.cell.2015.01.002</pub-id><pub-id pub-id-type="pmid">25619688</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Palacios</surname> <given-names>G.</given-names></name> <name><surname>Druce</surname> <given-names>J.</given-names></name> <name><surname>Du</surname> <given-names>L.</given-names></name> <name><surname>Tran</surname> <given-names>T.</given-names></name> <name><surname>Birch</surname> <given-names>C.</given-names></name> <name><surname>Briese</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>A new arenavirus in a cluster of fatal transplant-associated diseases</article-title>. <source>N. Engl. J. Med.</source> <volume>358</volume>, <fpage>991</fpage>&#x02013;<lpage>998</lpage>. <pub-id pub-id-type="doi">10.1056/NEJMoa073785</pub-id><pub-id pub-id-type="pmid">18256387</pub-id></citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Karplus</surname> <given-names>K.</given-names></name> <name><surname>Barrett</surname> <given-names>C.</given-names></name> <name><surname>Hughey</surname> <given-names>R.</given-names></name> <name><surname>Haussler</surname> <given-names>D.</given-names></name> <name><surname>Hubbard</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>1998</year>). <article-title>Sequence comparisons using multiple sequences detect three times as many remote homologues as pairwise methods</article-title>. <source>J. Mol. Biol.</source> <volume>284</volume>, <fpage>1201</fpage>&#x02013;<lpage>1210</lpage>. <pub-id pub-id-type="doi">10.1006/jmbi.1998.2221</pub-id><pub-id pub-id-type="pmid">9837738</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Phan</surname> <given-names>T. G.</given-names></name> <name><surname>Mori</surname> <given-names>D.</given-names></name> <name><surname>Deng</surname> <given-names>X.</given-names></name> <name><surname>Rajindrajith</surname> <given-names>S.</given-names></name> <name><surname>Ranawaka</surname> <given-names>U.</given-names></name> <name><surname>Fan Ng</surname> <given-names>T. F.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Small circular single stranded DNA viral genomes in unexplained cases of human encephalitis, diarrhea, and in untreated sewage</article-title>. <source>Virology</source> <volume>482</volume>, <fpage>98</fpage>&#x02013;<lpage>104</lpage>. <pub-id pub-id-type="doi">10.1016/j.virol.2015.03.011</pub-id><pub-id pub-id-type="pmid">25839169</pub-id></citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Quaiser</surname> <given-names>A.</given-names></name> <name><surname>Dufresne</surname> <given-names>A.</given-names></name> <name><surname>Ballaud</surname> <given-names>F.</given-names></name> <name><surname>Roux</surname> <given-names>S.</given-names></name> <name><surname>Zivanovic</surname> <given-names>Y.</given-names></name> <name><surname>Colombet</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Diversity and comparative genomics of Microviridae in Sphagnum-dominated peatlands</article-title>. <source>Front. Microbiol.</source> <volume>6</volume>:<fpage>375</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2015.00375</pub-id><pub-id pub-id-type="pmid">25972855</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Queiroz</surname> <given-names>A. P.</given-names></name> <name><surname>Santos</surname> <given-names>F. M.</given-names></name> <name><surname>Sassaroli</surname> <given-names>A.</given-names></name> <name><surname>H&#x000E1;rsi</surname> <given-names>C. M.</given-names></name> <name><surname>Monezi</surname> <given-names>T. A.</given-names></name> <name><surname>Mehnert</surname> <given-names>D. U.</given-names></name></person-group> (<year>2001</year>). <article-title>Electropositive filter membrane as an alternative for the elimination of PCR inhibitors from sewage and water samples</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>67</volume>, <fpage>4614</fpage>&#x02013;<lpage>4618</lpage>. <pub-id pub-id-type="doi">10.1128/AEM.67.10.4614-4618.2001</pub-id><pub-id pub-id-type="pmid">11571164</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reyes</surname> <given-names>A.</given-names></name> <name><surname>Blanton</surname> <given-names>L. V.</given-names></name> <name><surname>Cao</surname> <given-names>S.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Manary</surname> <given-names>M.</given-names></name> <name><surname>Trehan</surname> <given-names>I.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Gut DNA viromes of Malawian twins discordant for severe acute malnutrition</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>112</volume>, <fpage>11941</fpage>&#x02013;<lpage>11946</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1514285112</pub-id><pub-id pub-id-type="pmid">26351661</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reyes</surname> <given-names>A.</given-names></name> <name><surname>Haynes</surname> <given-names>M.</given-names></name> <name><surname>Hanson</surname> <given-names>N.</given-names></name> <name><surname>Angly</surname> <given-names>F. E.</given-names></name> <name><surname>Heath</surname> <given-names>A. C.</given-names></name> <name><surname>Rohwer</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>Viruses in the faecal microbiota of monozygotic twins and their mothers</article-title>. <source>Nature</source> <volume>466</volume>, <fpage>334</fpage>&#x02013;<lpage>338</lpage>. <pub-id pub-id-type="doi">10.1038/nature09199</pub-id><pub-id pub-id-type="pmid">20631792</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reyes</surname> <given-names>A.</given-names></name> <name><surname>Semenkovich</surname> <given-names>N. P.</given-names></name> <name><surname>Whiteson</surname> <given-names>K.</given-names></name> <name><surname>Rohwer</surname> <given-names>F.</given-names></name> <name><surname>Gordon</surname> <given-names>J. I.</given-names></name></person-group> (<year>2012</year>). <article-title>Going viral: next-generation sequencing applied to phage populations in the human gut</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>10</volume>, <fpage>607</fpage>&#x02013;<lpage>617</lpage>. <pub-id pub-id-type="doi">10.1038/nrmicro2853</pub-id><pub-id pub-id-type="pmid">22864264</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rice</surname> <given-names>P.</given-names></name> <name><surname>Longden</surname> <given-names>I.</given-names></name> <name><surname>Bleasby</surname> <given-names>A.</given-names></name></person-group> (<year>2000</year>). <article-title>EMBOSS: the European Molecular Biology Open Software Suite</article-title>. <source>Trends Genet.</source> <volume>16</volume>, <fpage>276</fpage>&#x02013;<lpage>277</lpage>. <pub-id pub-id-type="doi">10.1016/S0168-9525(00)02024-2</pub-id><pub-id pub-id-type="pmid">10827456</pub-id></citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rohwer</surname> <given-names>F.</given-names></name> <name><surname>Thurber</surname> <given-names>R. V.</given-names></name></person-group> (<year>2009</year>). <article-title>Viruses manipulate the marine environment</article-title>. <source>Nature</source> <volume>459</volume>, <fpage>207</fpage>&#x02013;<lpage>212</lpage>. <pub-id pub-id-type="doi">10.1038/nature08060</pub-id><pub-id pub-id-type="pmid">19444207</pub-id></citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rosenberg</surname> <given-names>R.</given-names></name></person-group> (<year>2015</year>). <article-title>Detecting the emergence of novel, zoonotic viruses pathogenic to humans</article-title>. <source>Cell. Mol. Life Sci.</source> <volume>72</volume>, <fpage>1115</fpage>&#x02013;<lpage>1125</lpage>. <pub-id pub-id-type="doi">10.1007/s00018-014-1785-y</pub-id><pub-id pub-id-type="pmid">25416679</pub-id></citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roux</surname> <given-names>S.</given-names></name> <name><surname>Krupovic</surname> <given-names>M.</given-names></name> <name><surname>Poulet</surname> <given-names>A.</given-names></name> <name><surname>Debroas</surname> <given-names>D.</given-names></name> <name><surname>Enault</surname> <given-names>F.</given-names></name></person-group> (<year>2012</year>). <article-title>Evolution and diversity of the Microviridae viral family through a collection of 81 new complete genomes assembled from virome reads</article-title>. <source>PLoS ONE</source> <volume>7</volume>:<fpage>e40418</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0040418</pub-id><pub-id pub-id-type="pmid">22808158</pub-id></citation>
</ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ruby</surname> <given-names>J. G.</given-names></name> <name><surname>Bellare</surname> <given-names>P.</given-names></name> <name><surname>DeRisi</surname> <given-names>J. L.</given-names></name></person-group> (<year>2013</year>). <article-title>PRICE: software for the targeted assembly of components of (Meta)genomic sequence data</article-title>. <source>G3 (Bethesda)</source> <volume>3</volume>, <fpage>865</fpage>&#x02013;<lpage>880</lpage>. <pub-id pub-id-type="doi">10.1534/g3.113.005967</pub-id><pub-id pub-id-type="pmid">23550143</pub-id></citation>
</ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Siebrasse</surname> <given-names>E. A.</given-names></name> <name><surname>Reyes</surname> <given-names>A.</given-names></name> <name><surname>Lim</surname> <given-names>E. S.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Mkakosya</surname> <given-names>R. S.</given-names></name> <name><surname>Manary</surname> <given-names>M. J.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>Identification of MW polyomavirus, a novel polyomavirus in human stool</article-title>. <source>J. Virol.</source> <volume>86</volume>, <fpage>10321</fpage>&#x02013;<lpage>10326</lpage>. <pub-id pub-id-type="doi">10.1128/JVI.01210-12</pub-id><pub-id pub-id-type="pmid">22740408</pub-id></citation>
</ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Simpson</surname> <given-names>J. T.</given-names></name> <name><surname>Wong</surname> <given-names>K.</given-names></name> <name><surname>Jackman</surname> <given-names>S. D.</given-names></name> <name><surname>Schein</surname> <given-names>J. E.</given-names></name> <name><surname>Jones</surname> <given-names>S. J.</given-names></name> <name><surname>Birol</surname> <given-names>I.</given-names></name></person-group> (<year>2009</year>). <article-title>ABySS: a parallel assembler for short read sequence data</article-title>. <source>Genome Res.</source> <volume>19</volume>, <fpage>1117</fpage>&#x02013;<lpage>1123</lpage>. <pub-id pub-id-type="doi">10.1101/gr.089532.108</pub-id><pub-id pub-id-type="pmid">19251739</pub-id></citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Skewes-Cox</surname> <given-names>P.</given-names></name> <name><surname>Sharpton</surname> <given-names>T. J.</given-names></name> <name><surname>Pollard</surname> <given-names>K. S.</given-names></name> <name><surname>DeRisi</surname> <given-names>J. L.</given-names></name></person-group> (<year>2014</year>). <article-title>Profile hidden Markov models for the detection of viruses within metagenomic sequence data</article-title>. <source>PLoS ONE</source> <volume>9</volume>:<fpage>e105067</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0105067</pub-id><pub-id pub-id-type="pmid">25140992</pub-id></citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Smits</surname> <given-names>S. L.</given-names></name> <name><surname>Bodewes</surname> <given-names>R.</given-names></name> <name><surname>Ruiz-Gonz&#x000E1;lez</surname> <given-names>A.</given-names></name> <name><surname>Baumg&#x000E4;rtner</surname> <given-names>W.</given-names></name> <name><surname>Koopmans</surname> <given-names>M. P.</given-names></name> <name><surname>Osterhaus</surname> <given-names>A. D.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Recovering full-length viral genomes from metagenomes</article-title>. <source>Front. Microbiol.</source> <volume>6</volume>:<fpage>1069</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2015.01069</pub-id><pub-id pub-id-type="pmid">26483782</pub-id></citation>
</ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Smits</surname> <given-names>S. L.</given-names></name> <name><surname>Osterhaus</surname> <given-names>A. D.</given-names></name></person-group> (<year>2013</year>). <article-title>Virus discovery: one step beyond</article-title>. <source>Curr. Opin. Virol.</source> <volume>3</volume>, <fpage>e1</fpage>&#x02013;<lpage>e6</lpage>. <pub-id pub-id-type="doi">10.1016/j.coviro.2013.03.007</pub-id></citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sobreira</surname> <given-names>T. J.</given-names></name> <name><surname>Gruber</surname> <given-names>A.</given-names></name></person-group> (<year>2008</year>). <article-title>Sequence-specific reconstruction from fragmentary databases using seed sequences: implementation and validation on SAGE, proteome and generic sequencing data</article-title>. <source>Bioinformatics</source> <volume>24</volume>, <fpage>1676</fpage>&#x02013;<lpage>1680</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btn283</pub-id><pub-id pub-id-type="pmid">18544546</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stamatakis</surname> <given-names>A.</given-names></name></person-group> (<year>2006</year>). <article-title>RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models</article-title>. <source>Bioinformatics</source> <volume>22</volume>, <fpage>2688</fpage>&#x02013;<lpage>2690</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl446</pub-id><pub-id pub-id-type="pmid">16928733</pub-id></citation>
</ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>P.</given-names></name> <name><surname>Chiu</surname> <given-names>C.</given-names></name></person-group> (<year>2010</year>). <article-title>Metagenomics for the discovery of novel human viruses</article-title>. <source>Future Microbiol.</source> <volume>5</volume>, <fpage>177</fpage>&#x02013;<lpage>189</lpage>. <pub-id pub-id-type="doi">10.2217/fmb.09.120</pub-id><pub-id pub-id-type="pmid">20143943</pub-id></citation>
</ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wajid</surname> <given-names>B.</given-names></name> <name><surname>Serpedin</surname> <given-names>E.</given-names></name></person-group> (<year>2012</year>). <article-title>Review of general algorithmic features for genome assemblers for next generation sequencers</article-title>. <source>Genomics Proteomics Bioinformatics</source> <volume>10</volume>, <fpage>58</fpage>&#x02013;<lpage>73</lpage>. <pub-id pub-id-type="doi">10.1016/j.gpb.2012.05.006</pub-id><pub-id pub-id-type="pmid">22768980</pub-id></citation>
</ref>
<ref id="B53">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>L. F.</given-names></name></person-group> (<year>2011</year>). <article-title>Discovering novel zoonotic viruses</article-title>. <source>N. S. W. Public Health Bull.</source> <volume>22</volume>, <fpage>113</fpage>&#x02013;<lpage>117</lpage>. <pub-id pub-id-type="doi">10.1071/NB10078</pub-id><pub-id pub-id-type="pmid">21781618</pub-id></citation>
</ref>
<ref id="B54">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Waterhouse</surname> <given-names>A. M.</given-names></name> <name><surname>Procter</surname> <given-names>J. B.</given-names></name> <name><surname>Martin</surname> <given-names>D. M.</given-names></name> <name><surname>Clamp</surname> <given-names>M.</given-names></name> <name><surname>Barton</surname> <given-names>G. J.</given-names></name></person-group> (<year>2009</year>). <article-title>Jalview Version 2&#x02013;a multiple sequence alignment editor and analysis workbench</article-title>. <source>Bioinformatics</source> <volume>25</volume>, <fpage>1189</fpage>&#x02013;<lpage>1191</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btp033</pub-id><pub-id pub-id-type="pmid">19151095</pub-id></citation>
</ref>
<ref id="B55">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zerbino</surname> <given-names>D. R.</given-names></name> <name><surname>Birney</surname> <given-names>E.</given-names></name></person-group> (<year>2008</year>). <article-title>Velvet: algorithms for <italic>de novo</italic> short read assembly using de Bruijn graphs</article-title>. <source>Genome Res.</source> <volume>18</volume>, <fpage>821</fpage>&#x02013;<lpage>829</lpage>. <pub-id pub-id-type="doi">10.1101/gr.074492.107</pub-id><pub-id pub-id-type="pmid">18349386</pub-id></citation>
</ref>
<ref id="B56">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Cole</surname> <given-names>J. R.</given-names></name></person-group> (<year>2014</year>). <article-title>A scalable and accurate targeted gene assembly tool (SAT-Assembler) for next-generation sequencing data</article-title>. <source>PLoS Comput. Biol.</source> <volume>10</volume>:<fpage>e1003737</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003737</pub-id><pub-id pub-id-type="pmid">25122209</pub-id></citation>
</ref>
<ref id="B57">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhong</surname> <given-names>X.</given-names></name> <name><surname>Guidoni</surname> <given-names>B.</given-names></name> <name><surname>Jacas</surname> <given-names>L.</given-names></name> <name><surname>Jacquet</surname> <given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Structure and diversity of ssDNA Microviridae viruses in two peri-alpine lakes (Annecy and Bourget, France)</article-title>. <source>Res. Microbiol.</source> <volume>166</volume>, <fpage>644</fpage>&#x02013;<lpage>654</lpage>. <pub-id pub-id-type="doi">10.1016/j.resmic.2015.07.003</pub-id><pub-id pub-id-type="pmid">26226335</pub-id></citation>
</ref>
</ref-list>
</back>
</article>