<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2023.1217750</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Predicting environmental stressor levels with machine learning: a comparison between amplicon sequencing, metagenomics, and total RNA sequencing based on taxonomically assigned data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hempel</surname>
<given-names>Christopher A.</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/803421/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Buchner</surname>
<given-names>Dominik</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mack</surname>
<given-names>Leoni</given-names>
</name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/953139/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Brasseur</surname>
<given-names>Marie V.</given-names>
</name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Tulpan</surname>
<given-names>Dan</given-names>
</name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1122614/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Leese</surname>
<given-names>Florian</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff8"><sup>8</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/103136/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Steinke</surname>
<given-names>Dirk</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/122721/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Integrative Biology, University of Guelph</institution>, <addr-line>Guelph, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff2"><sup>2</sup><institution>Centre for Biodiversity Genomics, University of Guelph</institution>, <addr-line>Guelph, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff3"><sup>3</sup><institution>Aquatic Ecosystem Research, University of Duisburg-Essen</institution>, <addr-line>Essen</addr-line>, <country>Germany</country></aff>
<aff id="aff4"><sup>4</sup><institution>Faculty of Aquatic Ecology, University of Duisburg-Essen</institution>, <addr-line>Essen</addr-line>, <country>Germany</country></aff>
<aff id="aff5"><sup>5</sup><institution>Leibniz Institute for the Analysis of Biodiversity Change, Zoological Research Museum A. Koenig</institution>, <addr-line>Bonn</addr-line>, <country>Germany</country></aff>
<aff id="aff6"><sup>6</sup><institution>School of Computer Science, University of Guelph</institution>, <addr-line>Guelph, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff7"><sup>7</sup><institution>Department of Animal Biosciences, University of Guelph</institution>, <addr-line>Guelph, ON</addr-line>, <country>Canada</country></aff>
<aff id="aff8"><sup>8</sup><institution>Centre for Water and Environmental Research (ZWU), University of Duisburg-Essen</institution>, <addr-line>Essen</addr-line>, <country>Germany</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0010">
<p>Edited by: Cristina Garc&#x00ED;a-Aljaro, University of Barcelona, Spain</p>
</fn>
<fn fn-type="edited-by" id="fn0011">
<p>Reviewed by: Drishti Kaul, J. Craig Venter Institute (La Jolla), United States; Craig Lee Moyer, Western Washington University, United States; Anders Lanz&#x00E9;n, Technology Center Expert in Marine and Food Innovation (AZTI), Spain</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Christopher A. Hempel, <email>chempel.work@gmail.com</email></corresp>
<corresp id="c002">Dirk Steinke, <email>dsteinke@uoguelph.ca</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>24</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1217750</elocation-id>
<history>
<date date-type="received">
<day>05</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>11</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2023 Hempel, Buchner, Mack, Brasseur, Tulpan, Leese and Steinke.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Hempel, Buchner, Mack, Brasseur, Tulpan, Leese and Steinke</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Microbes are increasingly (re)considered for environmental assessments because they are powerful indicators for the health of ecosystems. The complexity of microbial communities necessitates powerful novel tools to derive conclusions for environmental decision-makers, and machine learning is a promising option in that context. While amplicon sequencing is typically applied to assess microbial communities, metagenomics and total RNA sequencing (herein summarized as omics-based methods) can provide a more holistic picture of microbial biodiversity at sufficient sequencing depths. Despite this advantage, amplicon sequencing and omics-based methods have not yet been compared for taxonomy-based environmental assessments with machine learning.</p>
</sec>
<sec>
<title>Methods</title>
<p>In this study, we applied 16S and ITS-2 sequencing, metagenomics, and total RNA sequencing to samples from a stream mesocosm experiment that investigated the impacts of two aquatic stressors, insecticide and increased fine sediment deposition, on stream biodiversity. We processed the data using similarity clustering and denoising (only applicable to amplicon sequencing) as well as multiple taxonomic levels, data types, feature selection, and machine learning algorithms and evaluated the stressor prediction performance of each generated model for a total of 1,536 evaluated combinations of taxonomic datasets and data-processing methods.</p>
</sec>
<sec>
<title>Results</title>
<p>Sequencing and data-processing methods had a substantial impact on stressor prediction. While omics-based methods detected a higher diversity of taxa than amplicon sequencing, 16S sequencing outperformed all other sequencing methods in terms of stressor prediction based on the Matthews Correlation Coefficient. However, even the highest observed performance for 16S sequencing was still only moderate. Omics-based methods performed poorly overall, but this was likely due to insufficient sequencing depth. Data types had no impact on performance while feature selection significantly improved performance for omics-based methods but not for amplicon sequencing.</p>
</sec>
<sec>
<title>Discussion</title>
<p>We conclude that amplicon sequencing might be a better candidate for machine-learning-based environmental stressor prediction than omics-based methods, but the latter require further research at higher sequencing depths to confirm this conclusion. More sampling could improve stressor prediction performance, and while this was not possible in the context of our study, thousands of sampling sites are monitored for routine environmental assessments, providing an ideal framework to further refine the approach for possible implementation in environmental diagnostics.</p>
</sec>
</abstract>
<kwd-group>
<kwd>metabarcoding</kwd>
<kwd>metatranscriptomics</kwd>
<kwd>freshwater</kwd>
<kwd>stressor prediction</kwd>
<kwd>bioinformatics</kwd>
<kwd>ExStream</kwd>
<kwd>mesocosm</kwd>
<kwd>environmental assessment</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="0"/>
<equation-count count="1"/>
<ref-count count="115"/>
<page-count count="16"/>
<word-count count="14548"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Aquatic Microbiology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="sec1">
<label>1</label>
<title>Background</title>
<p>Globally, ecosystems are experiencing an unprecedented amount of human-induced environmental stress, caused by climate change, land use, pollution, habitat fragmentation, and the introduction of invasive species. As a consequence, ecosystems are deteriorating and biodiversity is declining faster than ever before in human history (<xref ref-type="bibr" rid="ref26">D&#x00ED;az et al., 2019</xref>; <xref ref-type="bibr" rid="ref111">WWF, 2020</xref>; <xref ref-type="bibr" rid="ref79">Pettorelli et al., 2021</xref>). The loss of biodiversity has extremely negative effects on ecosystem functions and, thereby, ecosystem services, which also reduces the economic value of ecosystems (<xref ref-type="bibr" rid="ref54">Kubiszewski et al., 2017</xref>). As a consequence, environmental management to protect and restore ecosystems has garnered increased attention, also at the political level (<xref ref-type="bibr" rid="ref26">D&#x00ED;az et al., 2019</xref>).</p>
<p>Environmental management includes the identification of prevalent stressors and their impacts on ecosystem health. Microbes (prokaryotes and unicellular eukaryotes) are very good indicators of ecosystem health because they play a crucial role in ecosystems and are extremely sensitive to changes in environmental conditions. Consequently, their community composition can reveal important information about the health and stress levels of ecosystems, which can be utilized for routine biomonitoring to guide measures for the protection and restoration of ecosystems (<xref ref-type="bibr" rid="ref92">Smith et al., 2015</xref>; <xref ref-type="bibr" rid="ref76">Pawlowski et al., 2016</xref>; <xref ref-type="bibr" rid="ref24">Cordier et al., 2019</xref>; <xref ref-type="bibr" rid="ref88">Sagova-Mareckova et al., 2021</xref>). Microbial community composition is usually determined by using amplicon sequencing, which involves target PCR to amplify taxonomic barcode genes (amplicons), typically the 16S ribosomal RNA (rRNA) gene for prokaryotes, the internal transcribed spacer 2 (ITS-2) for fungi, and the 18S rRNA gene for other microbial eukaryotes. Although this approach can introduce taxonomic and abundance bias due to varying binding affinities and amplification efficiencies of target primers (<xref ref-type="bibr" rid="ref82">Pinto and Raskin, 2012</xref>; <xref ref-type="bibr" rid="ref65">Lozupone et al., 2013</xref>; <xref ref-type="bibr" rid="ref105">Walker et al., 2015</xref>; <xref ref-type="bibr" rid="ref70">Meisel et al., 2016</xref>; <xref ref-type="bibr" rid="ref57">Laursen et al., 2017</xref>; <xref ref-type="bibr" rid="ref93">Stat et al., 2017</xref>), it is widely used because it is comparably cheap and can generate valuable and consistent information on community composition.</p>
<p>In contrast, metagenomics and metatranscriptomics are target-PCR-free methods that are usually applied to analyze the presence and expression of functional genes within communities (<xref ref-type="bibr" rid="ref109">Wooley et al., 2010</xref>; <xref ref-type="bibr" rid="ref7">Bashiardes et al., 2016</xref>; <xref ref-type="bibr" rid="ref3">Almeida and De Martinis, 2019</xref>; <xref ref-type="bibr" rid="ref90">Shakya et al., 2019</xref>); however, both methods also generate valuable data that can be used for taxonomic identification of community members as an alternative to amplicon sequencing.</p>
<p>Metagenomics targets all DNA in a sample, including non-functional genes, repetitive regions, and genes containing little taxonomic information due to insufficient variation. A vast number of these genes is lacking reference sequences in databases, and therefore, metagenomics generates large amounts of sequences that cannot be taxonomically annotated. At insufficient sequencing depth, this leads to a low biodiversity coverage that is outperformed by that of amplicon sequencing (<xref ref-type="bibr" rid="ref114">Yilmaz et al., 2011</xref>; <xref ref-type="bibr" rid="ref93">Stat et al., 2017</xref>; <xref ref-type="bibr" rid="ref97">Tessler et al., 2017</xref>). However, this limitation can be overcome by increasing the sequencing depth, and if the depth is increased sufficiently, biodiversity coverage through metagenomics can outperform that of amplicon sequencing (<xref ref-type="bibr" rid="ref89">Shah et al., 2010</xref>; <xref ref-type="bibr" rid="ref91">Shakya et al., 2013</xref>; <xref ref-type="bibr" rid="ref64">Logares et al., 2014</xref>; <xref ref-type="bibr" rid="ref10">Brumfield et al., 2020</xref>).</p>
<p>Total RNA sequencing (total RNA-Seq; <xref ref-type="bibr" rid="ref62">Li et al., 2016</xref>; <xref ref-type="bibr" rid="ref60">Li and Guan, 2017</xref>; <xref ref-type="bibr" rid="ref5">Bang-Andreasen et al., 2020</xref>), also termed double-RNA approach (<xref ref-type="bibr" rid="ref100">Urich et al., 2008</xref>), metatranscriptomics analysis of total rRNA (<xref ref-type="bibr" rid="ref99">Turner et al., 2013</xref>), total RNA metatranscriptomics (<xref ref-type="bibr" rid="ref112">Xue et al., 2020</xref>), or total RNA-seq-based metatranscriptomics (<xref ref-type="bibr" rid="ref60">Li and Guan, 2017</xref>), refers to metatranscriptomics without an mRNA enrichment step. Cellular RNA consists mostly of rRNA, including 16S and 18S rRNA, which means that a large portion of total RNA-Seq data can be used for taxonomic annotations of microbes. In a previous study, we showed that total RNA-Seq can identify a microbial mock community consisting of 10 species more accurately than metagenomics at almost one order of magnitude lower sequencing depth (<xref ref-type="bibr" rid="ref45">Hempel et al., 2022</xref>). Therefore, total RNA-Seq combines the advantages of both amplicon sequencing and metagenomics, as it avoids targeted PCR while producing large amounts of 16S and 18S sequences that can be taxonomically annotated.</p>
<p>Both metagenomics and metatranscriptomics are more costly than amplicon sequencing, but they can deliver target-PCR-free functional and taxonomical information across the tree of life, and as a result, there is a growing interest in their application for ecological assessments (<xref ref-type="bibr" rid="ref101">Uyaguari-Diaz et al., 2016</xref>; <xref ref-type="bibr" rid="ref58">Leese et al., 2018</xref>; <xref ref-type="bibr" rid="ref24">Cordier et al., 2019</xref>, <xref ref-type="bibr" rid="ref21">2021</xref>).</p>
<p>Another field of research increasingly considered for use in ecological assessments is machine learning. Machine learning comprises algorithms to discover structural patterns in data that can be used to make predictions. Learning, in that sense, means that the applied algorithms change their behavior through repeated training so that they perform better going forward (<xref ref-type="bibr" rid="ref107">Witten and Frank, 2005</xref>). Machine learning is increasingly being used in biological sciences, including microbial ecology and environmental assessments, due to its capacity to deal with the expanding scale and complexity of biological data (<xref ref-type="bibr" rid="ref35">Ghannam and Techtmann, 2021</xref>; <xref ref-type="bibr" rid="ref40">Greener et al., 2022</xref>). <xref ref-type="bibr" rid="ref24">Cordier et al. (2019)</xref> stated that machine learning is the most promising approach for routine biomonitoring as it has the potential to be faster, more cost-efficient, and more accurate than current morphology-based methods, and some researchers believe that ecology represents one of the most relevant areas for machine learning because it could solve a wide and diverse variety of ecological problems (<xref ref-type="bibr" rid="ref25">Crisci et al., 2012</xref>). 
It already has been applied successfully to amplicon-sequencing-based environmental assessments in freshwater (<xref ref-type="bibr" rid="ref92">Smith et al., 2015</xref>; <xref ref-type="bibr" rid="ref39">Good et al., 2018</xref>), marine and coastal water (<xref ref-type="bibr" rid="ref22">Cordier et al., 2017</xref>, <xref ref-type="bibr" rid="ref23">2018</xref>; <xref ref-type="bibr" rid="ref34">Gerhard and Gunsch, 2019</xref>; <xref ref-type="bibr" rid="ref36">Glasl et al., 2019</xref>; <xref ref-type="bibr" rid="ref33">Fr&#x00FC;he et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Dully et al., 2021</xref>), estuarine sediments (<xref ref-type="bibr" rid="ref55">Lanz&#x00E9;n et al., 2020</xref>), and soil (<xref ref-type="bibr" rid="ref46">Hermans et al., 2020</xref>), overcoming both the complex biological challenges associated with environmental data and the statistical challenges associated with the interpretation of large datasets. However, for the prediction of ecological variables with taxonomically assigned metagenomic data, machine learning has been applied only once so far (<xref ref-type="bibr" rid="ref18">Chang et al., 2017</xref>) and not at all using total RNA-Seq data. To date, High-Throughput Sequencing (HTS) has reached sequencing depths that allow for the application of omics-based approaches in environmental studies; however, it is unclear what scales are required to allow for machine-learning-based environmental stressor predictions. There is a clear need for a comparative assessment of metagenomics, total RNA-Seq, and amplicon sequencing with respect to their ability to provide adequate taxonomic datasets for machine learning approaches.</p>
<p>In this study, we compare the performance of amplicon sequencing, metagenomics, and total RNA-Seq to predict environmental stressor levels based on taxonomically assigned data using machine learning. We used samples obtained from an ExStream system (<xref ref-type="bibr" rid="ref81">Piggott et al., 2015</xref>) consisting of stream mesocosms that were exposed to fine sediment and an insecticide to investigate the impact of these aquatic key stressors on stream biodiversity and the decomposition of organic matter (<xref ref-type="bibr" rid="ref66">Mack et al., 2022</xref>). For amplicon sequencing, we used the two marker genes ITS-2 and 16S, both with an operational taxonomic unit (OTU) clustering and an exact sequence variant (ESV) denoising method. We evaluated the markers individually as well as in combination (multi-marker approach). Stressor prediction performance (SPP) for all datasets was based on different taxonomic levels (phylum, class, order, family, genus, and species), data types (abundance, presence&#x2013;absence (P&#x2013;A)), feature selection (with feature selection, without feature selection), and machine learning algorithms (k-Nearest Neighbors, Linear Support Vector Classification, Logistic Ridge Regression, Logistic Lasso Regression, Multilayer Perceptron, Random Forest, Support Vector Classification, and XGBoost).</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<p>The overall study design is shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>, and further details are given in the balance of this section.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Summary of the study design. ExStream samples were processed using omics-based methods and amplicon sequencing, and HTS data were processed using two clustering methods (only applicable to amplicon sequencing), six taxonomic levels, two data types, with or without feature selection, and eight machine learning algorithms, for a total of 1,536 evaluated combinations of sequencing and data-processing methods. KNN, k-Nearest Neighbors; Lasso, Logistic Lasso Regression; Ridge, Logistic Ridge Regression; LSVC, Linear Support Vector Classification; MLP, Multilayer Perceptron; RF, Random Forest; SVC, Support Vector Classification; XGB, XGBoost.</p>
</caption>
<graphic xlink:href="fmicb-14-1217750-g001.tif"/>
</fig>
<sec id="sec3">
<label>2.1</label>
<title>Experimental setup</title>
<sec id="sec4">
<label>2.1.1</label>
<title>ExStream system</title>
<p>A detailed explanation of the ExStream system can be found in <xref ref-type="bibr" rid="ref66">Mack et al. (2022)</xref>. In summary, stream mesocosms were connected to the adjacent stream Bieber, which provided them with a constant water flow. The stream Bieber is part of the Rhine-Main-Observatory,<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> a Long-Term Ecological Research site in Germany (<xref ref-type="bibr" rid="ref41">Haase et al., 2016</xref>; <xref ref-type="bibr" rid="ref71">Mirtl et al., 2018</xref>). Each mesocosm was set up using substrate and organisms from the stream. A random subset of the mesocosms was exposed to either the insecticide chlorantraniliprole (Coragen, DuPont), increased fine sediment concentration, or both. Both insecticides and fine sediment are known key stressors of aquatic environments introduced into streams by agricultural runoff. The stressors were induced using a 4&#x00D7;2 factorial design by adding 0.2&#x2009;&#x03BC;g/L, 2&#x2009;&#x03BC;g/L, and 20&#x2009;&#x03BC;g/L (acute stressor phase, 4&#x2009;days) or 0.02&#x2009;&#x03BC;g/L, 0.2&#x2009;&#x03BC;g/L, and 2&#x2009;&#x03BC;g/L (reduced stressor phase, 17&#x2009;days) of the insecticide and 450&#x2009;mL of fine sediment (&#x003C;2&#x2009;mm) to the mesocosms. Each possible combination of stressor levels was replicated eight times in addition to eight control mesocosms that did not receive any stressor, resulting in 64 mesocosms.</p>
</sec>
<sec id="sec5">
<label>2.1.2</label>
<title>Assessment of microbial community compositions</title>
<p>The goal of the ExStream experiment was to evaluate the individual and combined effects of the applied stressors on biodiversity and organic matter decomposition in streams. To investigate organic matter decomposition, cotton strips were added to all mesocosms. Cotton strips are mainly made of cellulose, which is a major source of carbon in stream ecosystems. Therefore, analyzing the biofilm on the cotton strips allowed the analysis of the diversity of microbial communities degrading organic matter.</p>
<p>The experiment was divided into a colonization phase (days &#x2212;21 to &#x2212;1) and a stressor phase (days 0 to 21). Two cotton strips were added to each of the 64 mesocosms on day &#x2212;17 (128 in total) and recovered after 28 or 35&#x2009;days, respectively [for more information on the phases and cotton strip addition and recovery, see <xref ref-type="bibr" rid="ref66">Mack et al. (2022)</xref>]. Four cotton strips were washed away during the experiment, so 124 cotton strips were recovered in total. A 2-cm-long piece of each cotton strip was cut off and transferred into a ZR BashingBead Lysis Tube (0.1 &#x0026; 0.5&#x2009;mm) pre-filled with 1&#x2009;mL of DNA/RNAShield (Zymo Research, Freiburg, Germany) using sterile laboratory gloves, forceps, and scissors. The samples were transferred to a laboratory, stored at &#x2212;20&#x00B0;C, and then homogenized using a bead mill homogenizer (MM 400, Retsch, Haan, Germany) at 1,800&#x2009;rpm for 30&#x2009;min. 300&#x2009;&#x03BC;L of each lysate were processed for amplicon sequencing at the University of Duisburg-Essen, Germany, and the remainder of each lysate was shipped to the University of Guelph, Canada, on dry ice and processed for metagenomics and total RNA-Seq.</p>
</sec>
</sec>
<sec id="sec6">
<label>2.2</label>
<title>Laboratory processing</title>
<sec id="sec7">
<label>2.2.1</label>
<title>Laboratory processing of amplicon sequencing</title>
<p>Amplicon sequencing was carried out following the workflow described by <xref ref-type="bibr" rid="ref12">Buchner et al. (2021)</xref>. All subsequent processing steps were completed on a Biomek FX<sup>P</sup> liquid handling workstation (Beckman Coulter, Brea, CA, United States). Briefly, replication of the samples was carried out before DNA extraction by transferring 60&#x2009;&#x03BC;L from the bead-beating tubes to deep-well plates pre-filled with 133&#x2009;&#x03BC;L of TNES buffer (50&#x2009;mM Tris, 400&#x2009;mM NaCl, 100&#x2009;mM EDTA, 0.5% SDS, pH 7.5) and 6&#x2009;&#x03BC;L of Proteinase K (10&#x2009;mg/mL), followed by incubation for 3&#x2009;h at 55&#x00B0;C for complete lysis of the samples. DNA was extracted using a modified version of the NucleoMag Tissue kit [Macherey Nagel, D&#x00FC;ren, Germany; for modifications see <xref ref-type="bibr" rid="ref12">Buchner et al. (2021)</xref>]. Extraction success was verified using a 1% agarose gel.</p>
<p>The PCR for the amplicon library was performed using a two-step PCR protocol following <xref ref-type="bibr" rid="ref115">Zizka et al. (2019)</xref>. Samples were amplified in a first-step PCR using the Qiagen Multiplex Plus Kit (Qiagen, Hilden, Germany) with a final concentration of 1x Multiplex Mastermix, 200&#x2009;mM of each primer [515F &#x0026; 806R for 16S (<xref ref-type="bibr" rid="ref15">Caporaso et al., 2011</xref>) and ITS3-CS1 &#x0026; ITS4-CS2 for ITS-2 (<xref ref-type="bibr" rid="ref31">Frey et al., 2016</xref>)], and 1&#x2009;&#x03BC;L of DNA, and filled up to a total volume of 10&#x2009;&#x03BC;L with PCR-grade water. The amplification protocol was: 5&#x2009;min of initial denaturation, 25&#x2009;cycles of 30&#x2009;s denaturation at 95&#x00B0;C, 90&#x2009;s of annealing at 50&#x00B0;C for 16S and 55&#x00B0;C for ITS-2, and 30&#x2009;s of extension at 72&#x00B0;C, finished by a final elongation step of 10&#x2009;min at 68&#x00B0;C. For subsequent demultiplexing, each of the PCR plates was tagged with a unique combination of inline tags (<xref ref-type="supplementary-material" rid="SM1">Supplementary File S1</xref>).</p>
<p>The first-step PCR results were cleaned up with magnetic beads. The PCR product was mixed with clean-up buffer (2.5&#x2009;M NaCl, 10&#x2009;mM Tris, 1&#x2009;mM EDTA, 20% PEG 8000, 0.05% Tween 20, 2% carboxylated Sera-Mag SpeedBeads (Cytiva Life Sciences, Marlborough, MA, United States), pH 8) at a 0.8x ratio and incubated for 5&#x2009;min, washed two times with wash buffer (10&#x2009;mM Tris, 80% EtOH, pH 7.5) for 30&#x2009;s, dried for 5&#x2009;min at RT and finally eluted in 40&#x2009;&#x03BC;L of elution buffer (10&#x2009;mM Tris, pH 8.5).</p>
<p>During the second-step PCR, samples were amplified with a final concentration of 1x Multiplex Mastermix, 1x Coralload Loading Dye, 100&#x2009;mM of each primer, and 2&#x2009;&#x03BC;L of the first-step product. Cycling conditions were the same as in the first-step PCR except for 61&#x00B0;C as annealing temperature and a decreased cycle number of 20. In the second-step PCR, each of the 96 wells was individually tagged so that the combination of the in-line tag from the first-step PCR and the index-read of the second-step PCR yielded a unique combination per sample. PCR success was verified using a 1% agarose gel.</p>
<p>PCR products were normalized to equal concentrations with normalization buffer (same as clean-up buffer, but with only 0.1% beads) following the same protocol as the clean-up after the first step but with a ratio of 0.7x and an elution volume of 50&#x2009;&#x03BC;L. All normalized products were pooled in the final libraries in equal parts. The libraries were concentrated using a silica-membrane spin column (Epoch Life Science, Missouri City, TX, United States) by mixing 1 volume of the library with 2 volumes of binding buffer (3&#x2009;M Guanidine Hydrochloride, 90% EtOH, 10&#x2009;mM Bis-Tris, pH 6) for the binding step (1&#x2009;min centrifugation, 11,000 x g), 2 washing steps (30&#x2009;s centrifugation, 11,000 x g) with wash buffer and a final elution (3&#x2009;min incubation at RT, followed by 1&#x2009;min centrifugation at 11,000 x g) with 100&#x2009;&#x03BC;L elution buffer. Library concentrations were quantified on a Fragment Analyzer (High Sensitivity NGS Fragment Analysis Kit; Advanced Analytical, Ankeny, United States). The libraries were then sequenced using the Illumina MiSeq platform with 2 lanes for each library with a paired-end kit (V2, 2&#x00D7;250 bp for 16S and V3, 2&#x00D7;300 bp for ITS) at CeGat (T&#x00FC;bingen, Germany).</p>
</sec>
<sec id="sec8">
<label>2.2.2</label>
<title>Laboratory processing of metagenomics and total RNA-Seq</title>
<p>DNA and total RNA were separately extracted from samples in 96-well plates using the NucleoMag DNA/RNA Water kit (D-MARK Biosciences, Toronto, Canada) that includes magnetic beads. Instead of using a magnetic plate to separate magnetic beads from buffers, we used the Magnetic Bead Extraction Replicator (V&#x0026;P Scientific, San Diego, United States), which allows for the transfer of all magnetic beads from one lysate/buffer/elution plate to another without the need to remove the supernatant from individual wells.<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref> The RNA extraction protocol involved a 25-min-long rDNase incubation step to digest DNA. Since the 96-well plates were open during the entire extraction, which posed a contamination risk, we added one negative extraction control to each row of each plate by replacing lysate with pure water. All extractions were performed under a sterile hood. DNA/RNA concentrations of all extracts and all negative extraction controls were measured using a Qubit fluorometer with the dsDNA HS Assay Kit and the RNA HS Assay Kit, respectively (Thermo Fisher Scientific, Burlington, Canada).</p>
<p>DNA and RNA libraries of all samples and negative extraction controls were prepared for metagenomics and total RNA-Seq using the NEBNext Ultra II DNA Library Prep Kit for Illumina and the NEBNext Ultra II Directional RNA Library Prep Kit for Illumina, respectively (New England Biolabs, Whitby, Canada). For RNA library preps, we did not perform mRNA enrichment or rRNA removal and instead processed the entire RNA. The RNA library prep kit has a default insert size of 200&#x2009;bp, and we chose an insert size of 150&#x2013;350&#x2009;bp for the DNA library preps to keep insert sizes approximately consistent. After library prep, we randomly selected 8 DNA sample libraries, 3 negative DNA extraction control libraries, 7 RNA sample libraries, and 4 negative RNA extraction control libraries and sent 2.5&#x2009;&#x03BC;L of each to the AAC Genomics Facility at the University of Guelph, Canada for analysis on an Agilent Bioanalyzer 2100 system (Agilent Technologies, United States) to confirm successful library preps and check for contaminations in negative extraction control libraries. After consultation with the sequencing facility (Center for Applied Genomics, Hospital for Sick Children, Toronto, Canada), we cleaned up all DNA and RNA libraries following the DNA/RNA library prep kit manual to remove primer dimers and unincorporated primers.</p>
<p>We pooled 5&#x2009;&#x03BC;L of each DNA and RNA library for sequencing, respectively, including negative extraction controls. We pooled equal volumes instead of equal concentrations because this pooling strategy allows for an equal relative sequencing depth per sample as opposed to an equal total sequencing depth. That way, the relative number of reads per sample mirrored the relative amount of DNA/RNA, avoiding an over- or underrepresentation of samples with higher or lower DNA/RNA amounts. Size distributions of the DNA and RNA library pools were assessed with a bioanalyzer by the sequencing facility, and the average fragment size was 386&#x2009;bp for the DNA library pool and 436&#x2009;bp for the RNA library pool. Both pools were paired-end (2&#x00D7;100&#x2009;bp) sequenced in a 50:50 ratio on a single lane of a NovaSeq 6000 SP flowcell.</p>
</sec>
</sec>
<sec id="sec9">
<label>2.3</label>
<title>Bioinformatics</title>
<sec id="sec10">
<label>2.3.1</label>
<title>Bioinformatics of amplicon sequencing</title>
<p>Raw data of the sequencing runs were delivered demultiplexed by index reads. Further demultiplexing by inline tags was done with the Python script &#x201C;demultiplexer&#x201D;.<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref> Sequences were subsequently processed with APSCALE v1.4 (<xref ref-type="bibr" rid="ref13">Buchner et al., 2022</xref>) using default parameters. Paired-end reads were merged using vsearch v2.21.1 (<xref ref-type="bibr" rid="ref86">Rognes et al., 2016</xref>). Primer sequences were trimmed with cutadapt v3.5 (<xref ref-type="bibr" rid="ref68">Martin, 2011</xref>). For 16S sequencing, only sequences with a length of 252&#x2009;&#x00B1;&#x2009;10&#x2009;bp were retained, and for ITS-2 sequencing, only sequences with a length ranging from 240 to 460&#x2009;bp were retained. Only sequences with a maximum expected error of 1 passed quality filtering. Reads were dereplicated and singletons were removed. For OTU generation, sequence clustering was performed with a similarity threshold of 97%, and for ESV generation, denoising was carried out with an alpha value of 2 and a minimum size of 8 as implemented in vsearch. Before taxonomic assignment, the resulting OTU and ESV tables were filtered for potentially biased sequences using the LULU algorithm (<xref ref-type="bibr" rid="ref32">Fr&#x00F8;slev et al., 2017</xref>) implemented in APSCALE.</p>
<p>Subsequently, only OTUs and ESVs found in both replicates of the same sample were summed up for all samples. After this initial data filtering, reads still left in the negative controls were subtracted from OTUs or ESVs, respectively, to generate final OTU and ESV tables. Taxonomic assignment was performed using DADA2 with default parameters in combination with the database SILVA 138.1 designed for DADA2 (<xref ref-type="bibr" rid="ref69">McLaren and Callahan, 2021</xref>) for 16S sequences and the database UNITE (<xref ref-type="bibr" rid="ref1">Abarenkov et al., 2021</xref>) for ITS-2 sequences, respectively.</p>
</sec>
<sec id="sec11">
<label>2.3.2</label>
<title>Bioinformatics of metagenomics and total RNA-Seq</title>
<p>In an earlier study, we investigated 672 combinations of bioinformatic tools to identify the best-performing combination to process and taxonomically annotate microbial mock community datasets (<xref ref-type="bibr" rid="ref45">Hempel et al., 2022</xref>). Based on these results, we processed both metagenomics and total RNA-Seq data as follows: we used Trimmomatic v0.39 (<xref ref-type="bibr" rid="ref9">Bolger et al., 2014</xref>) to trim the leading and trailing low-quality nucleotides of each read by cutting reads if the average quality of nucleotides in a sliding window of size 4 was below a PHRED score of 20. After trimming, we excluded reads shorter than 25 nucleotides and error-corrected reads using the error-correction module of the assembler SPAdes v3.14.1 (<xref ref-type="bibr" rid="ref6">Bankevich et al., 2012</xref>). Then we assembled the reads into scaffolds using MEGAHIT v1.2.9 (<xref ref-type="bibr" rid="ref63">Li et al., 2015</xref>) with the parameter &#x2018;presets&#x2019; set to &#x2018;meta-large&#x2019; to adjust k-mer sizes for the assembly of large and complex metagenomes. All other parameters were set to default. Subsequently, we mapped reads to assembled scaffolds to determine the abundance of each scaffold using BWA v0.7.17 (<xref ref-type="bibr" rid="ref59">Li and Durbin, 2009</xref>) with default parameters. We processed mapped reads using the function <italic>coverage</italic> of samtools v1.10 (<xref ref-type="bibr" rid="ref61">Li et al., 2009</xref>) to obtain the mean per-base coverage for each scaffold. For taxonomic annotation, we used the SILVA132_NR99 SSU and LSU reference databases (<xref ref-type="bibr" rid="ref83">Quast et al., 2013</xref>) in combination with kraken2 v2.1.1 (<xref ref-type="bibr" rid="ref108">Wood et al., 2019</xref>) using default parameters. 
The setup of the kraken2 database for SILVA required manual adaptations, which are described in the <xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. All code utilized is available on GitHub.<xref ref-type="fn" rid="fn0004"><sup>4</sup></xref></p>
</sec>
</sec>
<sec id="sec12">
<label>2.4</label>
<title>Pre-processing of taxonomic data</title>
<p>The data were further processed in Python v3.7.9 (<xref ref-type="bibr" rid="ref102">Van Rossum and Drake, 2009</xref>). The full code is available on GitHub<xref ref-type="fn" rid="fn0005"><sup>5</sup></xref> and involves the modules Pandas v1.3.5 (<xref ref-type="bibr" rid="ref85">Reback et al., 2021</xref>) and NumPy v1.21.3 (<xref ref-type="bibr" rid="ref43">Harris et al., 2020</xref>). We trained and evaluated machine learning models based on phylum, class, order, family, genus, and species to assess differences in Stressor prediction performance (SPP) among taxonomic levels. Because both metagenomics and total RNA-Seq datasets consisted of mean per-base coverage while amplicon sequencing datasets consisted of absolute read counts, we employed two different approaches to determine taxa abundances for each taxonomic level. When aggregating metagenomic and total RNA-Seq taxonomic datasets for each level separately, we adjusted taxa abundances for sequencing depth and scaffold length. For that, we selected all scaffolds assigned to each detected taxon and determined each taxon&#x2019;s absolute abundance as follows:</p>
<disp-formula id="E1">
<mml:math id="M1">
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msub>
<mml:mi mathvariant="italic">perBcov</mml:mi>
<mml:mi mathvariant="italic">taxon</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="italic">covered</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">bases</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">across</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">scaffolds</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">total</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">bases</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">across</mml:mi>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">scaffolds</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mspace width="8.25em"/>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mi mathvariant="italic">scaf</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi mathvariant="italic">perBcov</mml:mi>
<mml:mi mathvariant="italic">scaf</mml:mi>
</mml:msub>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="italic">len</mml:mi>
<mml:mi mathvariant="italic">scaf</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mn>1</mml:mn>
<mml:mi mathvariant="italic">scaf</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi mathvariant="italic">len</mml:mi>
<mml:mi mathvariant="italic">scaf</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mtext>,</mml:mtext>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>
<p>where <italic>perBcov<sub>taxon</sub></italic> represents the per-base coverage of a taxon, <italic>scaf</italic> represents the number of scaffolds assigned to a taxon, and <italic>perBcov<sub>scaf</sub></italic> and <italic>len<sub>scaf</sub></italic> represent the per-base coverage and length of each scaffold. We then converted absolute abundances into relative abundances. This process is similar to that for abundance estimation of binned scaffolds (<xref ref-type="bibr" rid="ref75">Parks et al., 2015</xref>).</p>
<p>When aggregating abundances based on amplicon sequencing data for each taxonomic level separately, we determined absolute taxa abundances as the cumulative read count of each detected taxon and converted absolute abundances into relative abundances.</p>
<p>For metagenomics and total RNA-Seq samples, negative extraction controls were subtracted from samples that were co-extracted with the controls. We converted relative abundances into absolute abundances by multiplying relative abundances by the number of reads per sample, summed the absolute abundances of taxa among all negative extraction controls per plate, and subtracted the cumulative absolute abundance of each taxon detected within controls from the actual samples of the same plate. Afterwards, we reverted absolute abundances back into relative abundances.</p>
<p>We then excluded the taxonomic entry <italic>NA</italic> from all datasets, which represented the relative abundance of sequences that could not be taxonomically annotated, likely due to missing references in databases or sequencing and data-processing errors. Next, we readjusted the relative abundances of all other taxa. In some datasets, some samples consisted only of sequences that could not be taxonomically annotated, meaning that they had a cumulative relative abundance of zero after excluding the <italic>NA</italic> entry. These samples were considered to have failed, and we excluded them from all datasets to ensure that all datasets contained the same samples, which ultimately resulted in 121 samples per dataset.</p>
<p>To assess differences in SPP among data types, we evaluated abundance and P&#x2013;A data. For P&#x2013;A data, we set all relative abundances above 0 to 1 (0&#x2009;=&#x2009;absent, 1&#x2009;=&#x2009;present). For abundance data, we followed the appropriate steps for analyzing compositional data, as pointed out by <xref ref-type="bibr" rid="ref38">Gloor et al. (2017)</xref>. Therefore, we first applied simple multiplicative replacement to replace zeros among all relative abundances using the function <italic>multiplicative_replacement</italic> of the Python module scikit-bio v0.5.6 (<xref ref-type="bibr" rid="ref98">The Scikit-Bio Development Team, 2020</xref>). The function replaces zeros with a small positive value &#x03B4;, which is based on the number of taxa while ensuring that the compositions still add up to 1. Then, we applied a centered log-ratio (clr) transformation using the function <italic>clr</italic> of scikit-bio, which captures the relationships between taxa and makes the data symmetric and linearly related. Since feature standardization is required by some machine learning algorithms, we further standardized taxa abundances using the function <italic>StandardScaler</italic> of the Python module scikit-learn v1.1.1 (<xref ref-type="bibr" rid="ref77">Pedregosa et al., 2011</xref>).</p>
<p>To include a multi-marker approach using both the ITS-2 and 16S marker genes in the evaluations, we combined the generated 16S and ITS-2 datasets by concatenating them separately for each clustering or denoising method (OTUs or ESVs). This resulted in eight taxonomic datasets that were evaluated (ITS-2 amplicon sequencing clustered into OTUs (ITS-2 OTU) or denoised into ESVs (ITS-2 ESV), 16S amplicon sequencing clustered into OTUs (16S OTU) or denoised into ESVs (16S ESV), multi-marker approach clustered into OTUs (16S&#x2009;+&#x2009;ITS-2 OTU) or denoised into ESVs (16S&#x2009;+&#x2009;ITS-2 ESV), metagenomics, and total RNA-Seq).</p>
</sec>
<sec id="sec13">
<label>2.5</label>
<title>Biodiversity analysis</title>
<p>To analyze the biodiversity detected per taxonomic dataset, we grouped detected taxa using NCBI GenBank taxonomy. We determined the total number of detected taxa per taxonomic dataset, the number of unique taxa detected within only one taxonomic dataset, and the number of overlapping taxa between taxonomic datasets at the phylum, genus, and species level. For that, we translated all phylum, genus, and species names within 16S and ITS-2 datasets into NCBI taxonomy to match names across all datasets and utilized reference databases. Specifically, we tested each name for matches with names in the scientific or non-scientific NCBI taxonomy,<xref ref-type="fn" rid="fn0006"><sup>6</sup></xref> and if a match was found, the name was translated into the scientific NCBI name. If no match was found, we manually checked if the respective name was available on NCBI under a different scientific or non-scientific name, and if so, the alternative scientific name was used. Otherwise, the name was not available on NCBI and was used without translation. After translation, taxa containing the terms &#x201C;candidatus,&#x201D; &#x201C;candidate,&#x201D; or &#x201C;[candida]&#x201D; were removed. Then, the number of overlapping taxa between taxonomic datasets was determined as the number of matches between the respective taxa within each taxonomic dataset, and the number of taxa unique to one taxonomic dataset was determined by subtracting the number of overlapping taxa from the total number of detected taxa.</p>
</sec>
<sec id="sec14">
<label>2.6</label>
<title>Machine learning</title>
<sec id="sec15">
<label>2.6.1</label>
<title>Data preprocessing</title>
<p>Taxon abundances/P&#x2013;A represented independent features, and we defined the dependent feature as the combinations of applied insecticide level (none, low, medium, high) and fine sediment addition (normal fine sediment concentration, increased fine sediment concentration) for each sample, resulting in eight classes that were predicted by the machine learning algorithms. Since correlated independent features add noise, we removed them by applying the SULOV (Searching for Uncorrelated List of Variables) algorithm using the function <italic>FE_remove_variables_using_SULOV_method</italic> of the Python module featurewiz v0.1.55,<xref ref-type="fn" rid="fn0007"><sup>7</sup></xref> which identifies all pairs of highly correlated independent features (features with a Pearson correlation coefficient of &#x003E;0.7 or&#x2009;&#x003C;&#x2009;&#x2212;0.7 by default), determines their Mutual Information Score (MIS) to the dependent feature, and keeps the independent feature with the highest MIS for each highly correlated feature pair.</p>
</sec>
<sec id="sec16">
<label>2.6.2</label>
<title>Test-train splitting and feature selection</title>
<p>Each ExStream mesocosm was sampled at two time points as part of the cotton strip assay, which meant that samples consisted of highly related paired samples, i.e., two samples of the same mesocosm. When splitting the datasets into train and test sets, we ensured that both samples of a pair were assigned to the same set (either training or test) to avoid data leakage between the sets.</p>
<p>Initially, we applied a 90:10 train-test split to the datasets (109 train samples, 12 test samples) and performed training and testing without repetition, but due to large discrepancies between train and test scores, we changed the train-test split ratio to 80:20 (97 train samples, 24 test samples) and repeated both training and testing splits three times in total. During each repetition, we randomly selected 12 pairs (24 samples) of highly related samples for the test dataset and trained and tested all models across all datasets with the same randomly selected 12 sample pairs per repetition.</p>
<p>For feature selection, we used Recursive Feature Elimination to select the 20 most important features using the function <italic>RFE</italic> from scikit-learn with a <italic>DecisionTreeClassifier</italic> as the estimator.</p>
</sec>
<sec id="sec17">
<label>2.6.3</label>
<title>Model selection, training, and testing</title>
<p>It is generally recommended to test multiple machine learning algorithms (<xref ref-type="bibr" rid="ref40">Greener et al., 2022</xref>), which is why we selected eight machine learning algorithms to predict stressor classes: k-Nearest Neighbors (KNN), Linear Support Vector Classification (LSVC), Logistic Ridge Regression (Ridge), Logistic Lasso Regression (Lasso), Multilayer Perceptron (MLP), Random Forest (RF), Support Vector Classification (SVC), and XGBoost (XGB). For thorough descriptions of these algorithms in a biological context see <xref ref-type="bibr" rid="ref40">Greener et al. (2022)</xref> and <xref ref-type="bibr" rid="ref35">Ghannam and Techtmann (2021)</xref>.</p>
<p>All algorithms, except XGBoost, are available in scikit-learn. To run the XGBoost algorithm, we used the Python module xgboost v1.6.1 (<xref ref-type="bibr" rid="ref19">Chen and Guestrin, 2016</xref>), which is compatible with scikit-learn. To optimize hyperparameters while avoiding overfitting, we performed Bayesian hyperparameter optimization with 10-fold cross-validation using the function <italic>BayesSearchCV</italic> of the Python module scikit-optimize v0.9.0.<xref ref-type="fn" rid="fn0008"><sup>8</sup></xref> The function is compatible with scikit-learn and builds a performance probability model for given hyperparameters, which is used to select the most promising hyperparameters through iterative performance evaluations. While not every possible hyperparameter combination is tested that way, this approach provides a good trade-off between optimization results and runtime. Model prediction performance was evaluated using the Matthews Correlation Coefficient (MCC), which ranges from &#x2212;1 to 1, where 1 means perfect predictions/performance, 0 means prediction performance as good as random guessing, and&#x2009;&#x2212;&#x2009;1 means all predictions are wrong, and increments between &#x2212;1 and 1 can be interpreted in the same way as increments of the Pearson correlation coefficient. All hyperparameters tested can be found in the publicly available code<xref ref-type="fn" rid="fn0009"><sup>9</sup></xref> and <xref ref-type="supplementary-material" rid="SM1">Supplementary File S2</xref>. The optimized hyperparameters were then used to train models on the entire training dataset, and model performances to predict classes of the testing dataset were evaluated using the MCC. During training on the entire dataset, learning curves were generated using the <italic>learning_curve</italic> function from scikit-learn. 
This process was repeated three times, as described above, and the mean and standard deviation (SD) of the training and test MCC scores across the three repetitions were determined.</p>
<p>We tested each possible combination of taxonomic datasets (ITS-2, 16S, 16S&#x2009;+&#x2009;ITS-2, metagenomics, and total RNA-Seq), clustering or denoising methods (OTU, ESV; only applicable to amplicon sequencing data), taxonomic levels (phylum, class, order, family, genus, and species), data types (abundance, P&#x2013;A), feature selection (with feature selection, without feature selection), and classification algorithms (KNN, Lasso, LSVC, Ridge, MLP, RF, SVC, and XGB), resulting in a total of 1,536 evaluated combinations.</p>
</sec>
</sec>
<sec id="sec18">
<label>2.7</label>
<title>Statistical analysis</title>
<p>We quantified the impact of sequencing types, taxonomic levels, data types, feature selection, and machine learning algorithms on SPP. For that, we converted all sequencing and data-processing methods into binary dummy variables and tested for significant correlations (<italic>p</italic>&#x2009;&#x2264;&#x2009;0.05) between each sequencing and data-processing method and the test MCC by calculating Spearman&#x2019;s rank correlation coefficient using the <italic>spearmanr</italic> function of the Python module SciPy v1.7.1 (<xref ref-type="bibr" rid="ref104">Virtanen et al., 2020</xref>). Additionally, we performed the same test for each sequencing type separately.</p>
</sec>
</sec>
<sec sec-type="results" id="sec19">
<label>3</label>
<title>Results</title>
<sec id="sec20">
<label>3.1</label>
<title>High-throughput sequencing results</title>
<p>We obtained 248,707,817 paired-end reads from metagenomics [mean per sample: 2&#x2009;M reads, standard deviation (SD): 2.4&#x2009;M reads], 206,096,238 from total RNA-Seq (mean per sample: 1.7&#x2009;M reads, SD: 2.6&#x2009;M reads), 21,719,985 reads from 16S sequencing (mean per sample: 152&#x2009;k reads, SD: 27&#x2009;k reads), and 27,033,469 reads from ITS-2 sequencing (mean per sample: 214&#x2009;k reads, SD: 41&#x2009;k reads; <xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S1</xref>; Bioproject number: PRJNA903104, SRA accession numbers: SRR22331748&#x2013;SRR22332597). The SD of the number of metagenomics and total RNA-Seq reads per sample was very high because we normalized metagenomics and total RNA-Seq libraries based on volume during library preparation so that the relative number of reads per sample mirrored the relative amount of DNA/RNA. This avoided an over- or underrepresentation of samples with higher or lower amounts of DNA/RNA but also led to substantial variations in the number of reads per metagenomics/total RNA-Seq library (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S1</xref>).</p>
</sec>
<sec id="sec21">
<label>3.2</label>
<title>Biodiversity analysis</title>
<p>There were no taxa overlaps between ITS-2 and 16S sequencing at the phylum, genus, and species level (<xref ref-type="fig" rid="fig2">Figure 2</xref>, for exact numbers, see <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3</xref>), while either method had overlapping taxa with both metagenomics and total RNA-Seq. Metagenomics and total RNA-Seq shared more taxa with each other than with ITS-2 or 16S sequencing. Metagenomics detected by far the most phyla (95), genera (2,488), and species (3,522), and the number of genera and species detected using metagenomics was much higher relative to that of other taxonomic datasets than the number of detected phyla. For total RNA-Seq, the number of detected phyla (76) was more than three times as high as that of ITS-2 (OTU: 23, ESV: 20) and 16S sequencing (OTU: 25, ESV: 24), the number of detected genera (903) was 1.3&#x2013;2.9 times as high as that of ITS-2 sequencing (OTU: 678, ESV: 491) and 16S sequencing (OTU: 315, ESV: 363), and the number of detected species (892) was 1.3&#x2013;1.8 times as high as that of ITS-2 sequencing (OTU: 673, ESV: 506) and much higher than that of 16S sequencing (OTU: 55, ESV: 114). 16S sequencing detected almost the same number of phyla as ITS-2 sequencing but by far the lowest number of genera and species among all taxonomic datasets. In terms of taxa unique to one taxonomic dataset, metagenomics detected far more unique phyla (19), genera (1,399), and species (2,660) than all other taxonomic datasets combined. Within ITS-2 and 16S sequencing, OTU clustering and ESV denoising resulted in different numbers of detected taxa, specifically for ITS-2 sequencing at genus level (OTU: 678, ESV: 491) and for 16S sequencing on the species level (OTU: 55, ESV: 114). 16S sequencing detected far fewer taxa at species level than at genus level. 
In terms of the distribution of taxonomic groups, 16S sequencing recovered almost exclusively bacterial taxa, while ITS-2 sequencing recovered not only taxa in the group &#x201C;plants and fungi&#x201D; but also invertebrate taxa. Omics-based methods recovered taxa across all groups, and they detected more bacterial taxa than 16S sequencing at all three taxonomic levels. At genus and species level, bacterial taxa represented most detected taxa.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Number of total, unique, and overlapping taxa for each taxonomic dataset on the phylum, genus, and species level (chord diagrams), as well as the distribution of taxonomic groups within each taxonomic dataset (pie charts). In the chord diagrams, the size of the outer bars represents the total number of detected taxa, the size of the connections between taxonomic datasets represents the number of overlapping taxa, and the fraction of outer bars with no connection to other taxonomic datasets represents the number of unique taxa detected only in that taxonomic dataset.</p>
</caption>
<graphic xlink:href="fmicb-14-1217750-g002.tif"/>
</fig>
</sec>
<sec id="sec22">
<label>3.3</label>
<title>Impact of taxonomic datasets and data-processing methods on SPP</title>
<p>SPP varied substantially across tested combinations of taxonomic datasets, clustering or denoising methods, taxonomic levels, machine learning algorithms, and feature selection (<xref ref-type="fig" rid="fig3">Figure 3</xref>; since data types had no significant impact on SPP (see <xref ref-type="fig" rid="fig4">Figures 4</xref>, <xref ref-type="fig" rid="fig5">5</xref>), only P&#x2013;A-based SPPs are shown). MCC values ranged from below 0 (SPP worse than random guessing) to 0.45 (moderate to good SPP). Feature selection overall improved SPP. ITS-2 sequencing and omics-based methods performed poorly overall, except for some combinations of ITS-2 sequencing with OTU clustering, whereas 16S sequencing and the multi-marker approach of combined 16S and ITS-2 markers performed better overall. The highest MCC of 0.45 was found for the following combination: 16S&#x2009;+&#x2009;ITS-2 sequencing, ESV denoising, genus level, P&#x2013;A data, Lasso algorithm, with feature selection. For this combination, the learning curves generated during each training repetition indicated that the model was overfitted, meaning that more data, i.e., more samples, would have likely further increased SPP (<xref ref-type="supplementary-material" rid="SM1">Supplementary Figure S2</xref>).</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>MCC as a proxy for SPP across all combinations of sequencing and data-processing methods tested. Since data types had no significant impact on SPP (see <xref ref-type="fig" rid="fig4">Figures 4</xref>, <xref ref-type="fig" rid="fig5">5</xref>), only P&#x2013;A-based SPPs are shown.</p>
</caption>
<graphic xlink:href="fmicb-14-1217750-g003.tif"/>
</fig>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Correlation between MCC as a proxy for SPP and sequencing and data-processing methods.</p>
</caption>
<graphic xlink:href="fmicb-14-1217750-g004.tif"/>
</fig>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Correlation between MCC as a proxy for SPP and data-processing methods for individual taxonomic datasets.</p>
</caption>
<graphic xlink:href="fmicb-14-1217750-g005.tif"/>
</fig>
<p>Overall, ITS-2 sequencing, metagenomics, and total RNA-Seq significantly negatively correlated with SPP, and 16S sequencing and combined 16S&#x2009;+&#x2009;ITS-2 markers significantly positively correlated with SPP (<xref ref-type="fig" rid="fig4">Figure 4</xref>). For amplicon sequencing, OTU clustering significantly increased SPP while ESV denoising significantly decreased SPP. Performance increased with increasing taxonomic resolution up to the order level and decreased at finer taxonomic levels. Data types did not significantly correlate with SPP. Feature selection significantly increased SPP. SPPs varied between machine learning algorithms, with XGB performing by far the worst and Lasso and Ridge, which are both based on logistic regression, performing the best, followed by MLP.</p>
<p>The impact of data-processing methods on SPP varied between individual taxonomic datasets (<xref ref-type="fig" rid="fig5">Figure 5</xref>). For ITS-2 ESV, the species level was significantly positively correlated with SPP, which contrasted with all other taxonomic datasets. For metagenomics, no taxonomic level significantly correlated with SPP. Data types did not significantly correlate with SPP in any taxonomic dataset. Feature selection had the strongest impact on metagenomics and no significant impact on 16S OTU/ESV and 16S&#x2009;+&#x2009;ITS-2 OTU. Across all taxonomic datasets, XGB performed poorly. Lasso and Ridge performed significantly well for all taxonomic datasets except metagenomics, total RNA-Seq, and ITS-2 ESV. Overall, the impact of data-processing methods was similar between 16S OTU/ESV, 16S&#x2009;+&#x2009;ITS-2 OTU/ESV, and ITS-2 OTU and differed between metagenomics, total RNA-Seq, and ITS-2 ESV.</p>
</sec>
</sec>
<sec sec-type="discussion" id="sec23">
<label>4</label>
<title>Discussion</title>
<sec id="sec24">
<label>4.1</label>
<title>Biodiversity analysis</title>
<p>The number of total, unique, and overlapping taxa varied substantially between taxonomic datasets. ITS-2 and 16S sequencing had no taxa overlap, confirming that both markers were group-specific; however, while 16S sequencing was almost exclusively specific to bacteria, ITS-2 sequencing detected not only taxa belonging to the NCBI division &#x201C;plants and fungi&#x201D; but also invertebrate taxa, indicating that the applied ITS-2 primers were not specific to fungi. Metagenomics and total RNA-Seq had overlapping taxa with ITS-2 and 16S sequencing but also detected a high number of taxa that the latter did not detect, and both methods detected bacterial, invertebrate, plant and fungal taxa, confirming that omics-based methods can recover groups across the tree of life, which is considered a major advantage over amplicon sequencing (<xref ref-type="bibr" rid="ref91">Shakya et al., 2013</xref>; <xref ref-type="bibr" rid="ref10">Brumfield et al., 2020</xref>; <xref ref-type="bibr" rid="ref73">Obiol et al., 2020</xref>). Many taxa found with total RNA-Seq were also found with metagenomics, but the latter also found an extremely high number of unique taxa. However, at genus and species level, ITS-2 sequencing detected a high number of unique taxa as well. These taxa were not recovered by omics-based methods, potentially because we only utilized SSU and LSU references for taxonomic annotation of omics-based sequences, or because ITS-2 sequencing has a higher taxonomic resolution within fungi than omics-based methods at our utilized sequencing depth. In contrast, omics-based methods found many more bacterial species, genera, and even phyla than 16S sequencing. 
While metagenomics can identify bacterial taxa at the species or even strain level given sufficient sequencing depth, 16S sequencing is often limited to bacterial genus level identifications (<xref ref-type="bibr" rid="ref51">Knight et al., 2018</xref>), which could explain why 16S sequencing detected fewer bacterial species than genera and fewer bacterial species than omics-based methods. However, the fact that omics-based methods also detected many more bacterial taxa at genus and even phylum level shows that either the taxonomic resolution of omics-based methods outperformed that of 16S sequencing for bacteria or that these methods detected a high number of false-positive bacterial taxa. There is no clear consensus in the literature as to which of those methods detects more taxa, with some studies showing that amplicon sequencing detects more taxa than omics-based methods (<xref ref-type="bibr" rid="ref93">Stat et al., 2017</xref>; <xref ref-type="bibr" rid="ref97">Tessler et al., 2017</xref>), while others show that both methods detect equal numbers of taxa (<xref ref-type="bibr" rid="ref17">Chan et al., 2015</xref>; <xref ref-type="bibr" rid="ref73">Obiol et al., 2020</xref>) or that omics-based methods outperform amplicon sequencing in terms of biodiversity coverage (<xref ref-type="bibr" rid="ref91">Shakya et al., 2013</xref>; <xref ref-type="bibr" rid="ref56">Laudadio et al., 2018</xref>; <xref ref-type="bibr" rid="ref113">Yan et al., 2018</xref>; <xref ref-type="bibr" rid="ref10">Brumfield et al., 2020</xref>). Biodiversity coverage also depends on how well an environment is represented in reference databases, and for less-studied environments that are poorly represented in reference databases, it is possible that the majority of omics-based sequences cannot be taxonomically annotated, resulting in low overall taxonomic resolution (<xref ref-type="bibr" rid="ref93">Stat et al., 2017</xref>). 
Our results support both hypotheses: (1) omics-based methods detect more taxa overall, and (2) amplicon sequencing detects more taxa within target groups, at least for fungi, which aligns with the advantages and disadvantages of either approach. In theory, all taxa detected with amplicon sequencing should also have been detected with omics-based methods, but our results indicate that sequencing depth for omics-based methods must be increased substantially to be able to detect the same taxa. Tools and databases that incorporate references from more taxonomic markers to identify omics-based sequences should also be further explored. However, given continuous technological advancements in HTS capacities, sufficient sequencing depths should become more affordable, and in combination with the steady growth of reference databases, we expect omics-based methods to unilaterally detect more taxa than amplicon sequencing at equal or higher taxonomic resolution in the future.</p>
</sec>
<sec id="sec25">
<label>4.2</label>
<title>Impact of sequencing methods on SPP</title>
<p>SPP varied substantially among taxonomic datasets. 16S sequencing was the only standalone method positively correlated with SPP, and combining 16S with ITS-2 sequencing data slightly improved SPP. We expected omics-based methods to outperform amplicon sequencing because the former are not group-specific and can cover biodiversity across the tree of life, providing a more complete picture of microbial communities; however, the opposite was the case, indicating that while omics-based methods did detect more taxa, they also missed crucial taxa, detected taxa without correlation to stressors, and/or generated more noise, which decreased SPP. This was further supported by the fact that the SPP of metagenomics, which detected the highest number of taxa, improved substantially under feature selection, i.e., the exclusion of all but the 20 most relevant taxa for model performance. However, even with feature selection, metagenomics still showed poor overall SPP, indicating that the feature-selected taxa did not include crucial taxa, did not correlate with stressors, or were poorly represented. This could be a result of insufficient sequencing depth, possibly causing insufficient recovery of taxa, or of the utilized reference database (SILVA), which only contains SSU and LSU sequences and no other commonly used markers or whole genome sequences, decreasing the likelihood of finding a taxonomic match among omics-based sequences.</p>
<p>Typical metagenomics experiments aim to generate between 1 and 10 Gb of metagenomic data per sample (<xref ref-type="bibr" rid="ref84">Quince et al., 2017</xref>) while we generated on average 0.2 Gb of metagenomic data per sample, which is one to two orders of magnitude lower. Increasing the sequencing depth of omics-based methods to ensure that taxa with high bioindication potential are sufficiently represented might increase SPP but is currently also associated with substantially higher costs. In previous studies, we showed that total RNA-Seq outperformed metagenomics in identifying a microbial community and reconstructing SSU rRNA sequences (<xref ref-type="bibr" rid="ref45">Hempel et al., 2022</xref>, <xref ref-type="bibr" rid="ref44">2023</xref>) at lower sequencing depth and, therefore, costs, likely due to higher SSU rRNA sequence yield when using total RNA-Seq. Therefore, for the present study, we expected that total RNA-Seq would have a higher SPP than metagenomics at comparably low sequencing depth (on average 0.17 Gb of total RNA-Seq data per sample). However, total RNA-Seq performed even worse, indicating that even the sequencing depth of total RNA-Seq was too low.</p>
<p>The poor performance of metagenomics could also be related to the fact that only SSU and LSU reference sequences were used for taxonomic annotation instead of all available markers or whole genome sequences to utilize all available metagenomic information. In the present study, we compared metagenomics and total RNA-Seq explicitly due to the aforementioned advantages of total RNA-Seq in regard to SSU and LSU rRNA coverage. Therefore, testing databases and tools that incorporate more markers or whole genome sequences for taxonomic annotation, such as MetaPhlAn (<xref ref-type="bibr" rid="ref8">Blanco-M&#x00ED;guez et al., 2023</xref>) or the NCBI GenBank database, was out of scope for this study; however, due to the poor performance of both omics-based methods, these options should be further explored in similar future studies.</p>
<p>Almost all studies that utilize machine learning for taxonomically assigned HTS data in an ecological context involve amplicon sequencing (<xref ref-type="bibr" rid="ref92">Smith et al., 2015</xref>; <xref ref-type="bibr" rid="ref22">Cordier et al., 2017</xref>, <xref ref-type="bibr" rid="ref23">2018</xref>; <xref ref-type="bibr" rid="ref34">Gerhard and Gunsch, 2019</xref>; <xref ref-type="bibr" rid="ref33">Fr&#x00FC;he et al., 2020</xref>; <xref ref-type="bibr" rid="ref46">Hermans et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Dully et al., 2021</xref>), and to our knowledge, there is only one study that involves metagenomics in that context (<xref ref-type="bibr" rid="ref18">Chang et al., 2017</xref>) and none that compare amplicon sequencing with omics-based methods. However, in a medical context, <xref ref-type="bibr" rid="ref67">Marcos-Zambrano et al. (2021)</xref> provide a thorough overview of human microbiome studies that utilize machine learning for HTS data. While they list seven studies that applied machine learning to both amplicon sequencing and metagenomics data, only one of them compared the performance of both sequencing methods based on community composition (<xref ref-type="bibr" rid="ref28">Douglas et al., 2018</xref>), showing that amplicon sequencing outperformed metagenomics in classifying patients and the state of Crohn&#x2019;s disease while metagenomics outperformed amplicon sequencing in classifying treatment response. These results further demonstrate that SPP is dependent on the environmental variables investigated. 
Multiple other medical studies utilizing machine learning for disease predictions based on metagenomics community compositions show good SPP for predicting colorectal cancer, inflammatory bowel disease, diabetes, rheumatoid arthritis, and liver cirrhosis (<xref ref-type="bibr" rid="ref42">Hacilar et al., 2018</xref>; <xref ref-type="bibr" rid="ref110">Wu et al., 2018</xref>; <xref ref-type="bibr" rid="ref2">Ai et al., 2019</xref>). These studies clearly show the potential of omics-based methods for medical applications, and further omics-based ecological research with sufficient sequencing depth is required to show if the methods hold the same potential for environmental stressor predictions.</p>
</sec>
<sec id="sec26">
<label>4.3</label>
<title>Impact of data-processing methods on SPP</title>
<p>Data-processing methods had a substantial impact on SPP, and based on the utilized methods, SPP could range from low to high within one taxonomic dataset.</p>
<sec id="sec27">
<label>4.3.1</label>
<title>Impact of clustering and denoising methods on SPP</title>
<p>For amplicon sequencing data, OTU clustering significantly improved SPP while ESV denoising significantly decreased SPP. This observation is in contrast to the emerging recommendation to denoise amplicon sequences into ESVs (<xref ref-type="bibr" rid="ref14">Callahan et al., 2017</xref>; <xref ref-type="bibr" rid="ref51">Knight et al., 2018</xref>). Studies comparing OTU clustering and ESV denoising approaches have not yet reached a consensus, showing that either both approaches lead to similar results (<xref ref-type="bibr" rid="ref37">Glassman and Martiny, 2018</xref>; <xref ref-type="bibr" rid="ref103">Vera-Gargallo et al., 2019</xref>; <xref ref-type="bibr" rid="ref48">Kang et al., 2021</xref>), ESV denoising outperforms OTU clustering (<xref ref-type="bibr" rid="ref16">Caruso et al., 2019</xref>; <xref ref-type="bibr" rid="ref94">Tapolczai et al., 2019</xref>; <xref ref-type="bibr" rid="ref47">Joos et al., 2020</xref>), or vice versa (<xref ref-type="bibr" rid="ref87">Roy et al., 2019</xref>; <xref ref-type="bibr" rid="ref96">Tedersoo et al., 2022</xref>). Our results support the latter, although more similar studies are required to determine if clustering or denoising is more appropriate for machine-learning-based environmental predictions using microbial communities.</p>
</sec>
<sec id="sec28">
<label>4.3.2</label>
<title>Impact of taxonomic levels on SPP</title>
<p>In general, a higher taxonomic resolution provides a better picture of microbial communities, but our results show that the species level correlated worse with SPP than genus, family, order, and even class levels. For ITS-2 sequencing and omics-based methods, the high number of detected taxa at the species level might have added more noise than value to the data. This is indicated by the significantly positive impact of feature selection on SPP, i.e., the limitation of the number of included taxa. However, for 16S sequencing, feature selection had no impact on SPP while the species level still negatively correlated with SPP. This result may be related to the number of sequences that could not be assigned to the species level and were consequently dropped. The lower the taxonomic level considered, the harder it is to annotate taxonomy due to the lack of reference sequences in databases, and the more sequences are dropped from the downstream analysis. In microbiome amplicon sequencing studies, the taxonomic resolution is usually limited to the genus level due to the difficulty in designing primers that resolve microbial communities at the species level (<xref ref-type="bibr" rid="ref51">Knight et al., 2018</xref>). Metagenomics allows for taxonomic resolutions at the species level or even strain level, but this requires sufficient sequencing depth (<xref ref-type="bibr" rid="ref51">Knight et al., 2018</xref>). Dropping sequences from the analysis is equivalent to a loss of information, which could have decreased SPP at the species level. 
It is also possible that correlations between taxa and environmental variables are higher at higher taxonomic levels because higher taxonomic groups can be overall ecologically coherent, i.e., share similar physiologies, while lower taxonomic groups can be ecologically incoherent and have very different physiologies (<xref ref-type="bibr" rid="ref80">Philippot et al., 2010</xref>; <xref ref-type="bibr" rid="ref20">Choe et al., 2021</xref>; <xref ref-type="bibr" rid="ref4">Auladell et al., 2022</xref>). Once reference databases have been extensively expanded and most sequences can be taxonomically annotated, it will be possible to determine if the lack of reference sequences or ecological incoherency of species explains lower SPP at the species level.</p>
</sec>
<sec id="sec29">
<label>4.3.3</label>
<title>Impact of data types on SPP</title>
<p>We were surprised that the data types (abundance/P&#x2013;A) did not have an impact on SPP, given that many studies focus on methods to improve abundance estimates from HTS data (<xref ref-type="bibr" rid="ref27">Dillies et al., 2013</xref>; <xref ref-type="bibr" rid="ref38">Gloor et al., 2017</xref>; <xref ref-type="bibr" rid="ref106">Weiss et al., 2017</xref>; <xref ref-type="bibr" rid="ref78">Pereira et al., 2018</xref>). The difference in abundance and P&#x2013;A data lies in the weight of the taxa; in P&#x2013;A data, abundant and rare taxa are weighted equally, making the data more sensitive to noise but also to subtle differences in community composition. Using simulated data, <xref ref-type="bibr" rid="ref53">Koh et al. (2019)</xref> demonstrated that P&#x2013;A data is more powerful when taxa associated with an environmental variable are rare while abundance data is more powerful when those taxa are abundant. However, a large-scale morphological study on benthic invertebrates showed that ecological status classifications based on abundance and P&#x2013;A data showed only minor variations (<xref ref-type="bibr" rid="ref11">Buchner et al., 2019</xref>). In a microbial context, multiple HTS studies showed similar correlations of both abundance and P&#x2013;A data with environmental variables (<xref ref-type="bibr" rid="ref72">Muletz Wolz et al., 2018</xref>; <xref ref-type="bibr" rid="ref52">Knowles et al., 2019</xref>; <xref ref-type="bibr" rid="ref30">Farinella et al., 2022</xref>), while some studies showed that correlations differed between data types (<xref ref-type="bibr" rid="ref49">Kask et al., 2020</xref>; <xref ref-type="bibr" rid="ref95">Tavalire et al., 2021</xref>). 
These results indicate that the impact of data types might depend on the studied environmental variables, but if further research shows that both data types have similar predictive power for environmental assessments, as our results suggest, then P&#x2013;A data could be used exclusively in future environmental assessment studies. This would avoid the rather complex and partially disagreeing statistical methods required when working with compositional data, i.e., HTS abundance data (<xref ref-type="bibr" rid="ref27">Dillies et al., 2013</xref>; <xref ref-type="bibr" rid="ref38">Gloor et al., 2017</xref>; <xref ref-type="bibr" rid="ref106">Weiss et al., 2017</xref>; <xref ref-type="bibr" rid="ref78">Pereira et al., 2018</xref>). Furthermore, if abundance and P&#x2013;A data generate similar results, then the often-stated advantage of metagenomics to generate abundance data free from target PCR bias (<xref ref-type="bibr" rid="ref51">Knight et al., 2018</xref>; <xref ref-type="bibr" rid="ref50">Khachatryan et al., 2020</xref>) would become irrelevant, which would decrease the value of omics-based approaches in comparison to amplicon sequencing.</p>
</sec>
<sec id="sec30">
<label>4.3.4</label>
<title>Impact of feature selection on SPP</title>
<p>Feature selection can be applied to microbial data to remove noninformative, noisy, or redundant features (<xref ref-type="bibr" rid="ref35">Ghannam and Techtmann, 2021</xref>). This is generally recommended because the high number of observed features can increase the risk of overfitting, which is described as the &#x201C;curse of dimensionality&#x201D; (<xref ref-type="bibr" rid="ref74">Oudah and Henschel, 2018</xref>). However, feature selection goes against the proposed idea that a more holistic picture of environmental microbial communities is beneficial for predicting environmental variables, as it reduces the number of taxa included in prediction models. Our results suggest that feature selection improves SPP overall and especially for metagenomics, while the SPP of 16S sequencing was not impacted by feature selection. This indicates that the increased biodiversity coverage of omics-based methods might in fact not be beneficial for machine learning predictions and that datasets covering a lower number of taxa, as generated by amplicon sequencing, might result in more accurate and precise predictions. It should be noted, though, that ITS-2 sequencing detected approximately as many species as total RNA-Seq, and feature selection did increase the SPP of ITS-2 sequencing, showing that amplicon sequencing can also be significantly impacted by feature selection. Furthermore, the sequencing depth of metagenomics and total RNA-Seq in our study was very low, which could have influenced the impact of feature selection. If similar studies with a sufficient sequencing depth come to the same conclusion that omics-based methods in fact detect too many taxa for accurate and precise environmental assessments and require feature selection, then this would strongly tip the balance in favor of amplicon sequencing.</p>
</sec>
<sec id="sec31">
<label>4.3.5</label>
<title>Impact of machine learning algorithms on SPP</title>
<p>Machine learning algorithms had a substantial impact on SPP, and even when applying two different algorithms to the same data set, the resulting MCC could range from 0.38 to &#x2212;0.05. This illustrates the importance of testing multiple machine learning algorithms, which is recommended in general (<xref ref-type="bibr" rid="ref40">Greener et al., 2022</xref>). One of the most commonly applied machine learning classification algorithms for HTS data is RF (<xref ref-type="bibr" rid="ref92">Smith et al., 2015</xref>; <xref ref-type="bibr" rid="ref33">Fr&#x00FC;he et al., 2020</xref>; <xref ref-type="bibr" rid="ref46">Hermans et al., 2020</xref>; <xref ref-type="bibr" rid="ref55">Lanz&#x00E9;n et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Dully et al., 2021</xref>; <xref ref-type="bibr" rid="ref35">Ghannam and Techtmann, 2021</xref>; <xref ref-type="bibr" rid="ref67">Marcos-Zambrano et al., 2021</xref>), which reveals which feature contributed most to a prediction. Other popular algorithms are XGB, Support Vector Machines (which include SVC and LSVC), Logistic Regression, and KNN (<xref ref-type="bibr" rid="ref35">Ghannam and Techtmann, 2021</xref>; <xref ref-type="bibr" rid="ref67">Marcos-Zambrano et al., 2021</xref>; <xref ref-type="bibr" rid="ref40">Greener et al., 2022</xref>). However, among those algorithms, RF and (L)SVC did not significantly correlate with SPP in our study, while XGB and KNN significantly negatively correlated with SPP and only logistic regression, specifically Lasso and Ridge, significantly positively correlated with SPP. Linear algorithms have the lowest flexibility among all popular machine learning algorithms, since they assume only linear relationships, and while other algorithms can assume non-linear relationships, which increases their flexibility and is often considered beneficial for the analysis of large and complex data, this was not the case for our study. 
In contrast, MLP, which represents a simple neural network (NN) with the highest flexibility among all algorithms tested in our study, performed overall the best after Lasso and Ridge and specifically the best for omics-based methods that generated the largest datasets. NNs are currently among the most powerful machine learning algorithms for the analysis of extremely large data, and their impact is so significant that an entirely new field of research emerged around NNs, called deep learning (<xref ref-type="bibr" rid="ref40">Greener et al., 2022</xref>). To unfold their potential, NNs require large amounts of samples that usually go beyond the number of samples generated in a single biological study. However, thousands of sampling sites are monitored for routine environmental assessments, and once the broad application of omics-based methods becomes more affordable, it will be interesting to see if NNs are required for good SPP based on omics data or if less complex machine learning algorithms will be sufficient or even more appropriate.</p>
<p>Overall, our study shows that data-processing methods should be chosen carefully since they can have a high impact on SPP and that methods resulting in the single best SPP are not necessarily the most appropriate overall. Therefore, we conclude that it is advisable to explore multiple sequencing and, in particular, data-processing methods to maximize prediction performance.</p>
</sec>
</sec>
<sec id="sec32">
<label>4.4</label>
<title>Perspectives for ecological assessments</title>
<p>The highest MCC, i.e., the best SPP observed in our study was 0.45, indicating moderate to good performance. This is promising, but stressor predictions must be more accurate and precise to reach the standard for applied ecological assessments. However, while the stressors tested in our study (insecticide and increased fine sediment deposition) have direct negative effects on typical indicator organisms (e.g., benthic macroinvertebrates), little is known about their effects on microbial communities. Since many microbes are a good indicator of ecosystem health and respond sensitively to stressors, we expected a shift of the microbial communities under exposure to insecticide and increased fine sediment deposition, at least due to indirect top-down effects caused by the reduced abundance of benthic macroinvertebrates that typically graze on cotton strips. But it is also possible that direct or indirect effects of the stressors on microbes were too low to cause a sufficient shift in microbial communities for taxonomy-based stressor predictions or even that increased fine sediment deposition was beneficial for microbial communities because it provided additional surface habitat for microbes or stimulated organic matter decomposition through physical abrasion of the cotton strips. Therefore, our observed insufficient SPP could also be a consequence of stressor choice rather than limitations of sequencing depth or machine learning, especially since other studies show good performance of machine learning models for environmental assessments based on amplicon sequencing (<xref ref-type="bibr" rid="ref22">Cordier et al., 2017</xref>, <xref ref-type="bibr" rid="ref23">2018</xref>; <xref ref-type="bibr" rid="ref34">Gerhard and Gunsch, 2019</xref>; <xref ref-type="bibr" rid="ref33">Fr&#x00FC;he et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Dully et al., 2021</xref>).</p>
<p><xref ref-type="bibr" rid="ref92">Smith et al. (2015)</xref> showed that the performance of prediction models can vary considerably based on the predicted environmental variables (including stressor variables). When they attempted to predict 38 geochemical groundwater variables based on 16S sequencing data, the predicted and actual values of 26 variables significantly correlated with each other while those of 12 variables did not. This was further supported by <xref ref-type="bibr" rid="ref46">Hermans et al. (2020)</xref>, who predicted seven soil variables based on 16S sequencing data and found that the correlations between predicted and actual values ranged from weak to strong and were further dependent on the land use type of the investigated samples. This highlights the need for more exploratory research using different stressors until machine learning can be broadly applied to ecological assessments that involve many stressors.</p>
<p>Nevertheless, the learning curves generated for our best model indicate that more samples likely would have increased SPP. This result is promising because it shows that further sampling likely would have revealed subtle yet distinctive community shifts that would have allowed for better predictions without requiring further knowledge about the direct or indirect effects of the stressors on microbes, which further highlights the potential of machine learning for HTS-based environmental assessments given sufficient sample size.</p>
<p>We have only investigated the taxonomic information generated by metagenomics and total RNA-Seq, but both methods also generate information on functional diversity (metagenomics) and differential gene expression (total RNA-Seq). This information can also be integrated, which is why omics-based methods are gaining increased attention for environmental assessments (<xref ref-type="bibr" rid="ref101">Uyaguari-Diaz et al., 2016</xref>; <xref ref-type="bibr" rid="ref58">Leese et al., 2018</xref>; <xref ref-type="bibr" rid="ref24">Cordier et al., 2019</xref>, <xref ref-type="bibr" rid="ref21">2021</xref>), and it remains to be tested to what extent SPP can be increased by integrating taxonomical and functional information.</p>
</sec>
<sec id="sec33">
<label>4.5</label>
<title>Conclusion</title>
<p>We demonstrate that sequencing and data-processing methods have a substantial impact on environmental stressor prediction when applying machine learning to taxonomically assigned HTS data. Omics-based methods detected many more taxa than amplicon sequencing, and while this is considered an advantage, amplicon sequencing, specifically 16S sequencing, outperformed all other sequencing methods in terms of stressor prediction performance (SPP). However, the best observed SPP for 16S sequencing was only moderate to good, meaning that further improvements are necessary to meet the required standard for applied ecological assessments. Nevertheless, learning curves indicated that more samples would likely have increased SPP, demonstrating the potential for further research. Omics-based methods performed poorly, possibly due to insufficient sequencing depth or too shallow a taxonomic resolution of crucial taxa, but given that other studies demonstrated the potential of omics-based methods in combination with machine learning, further omics-based ecological research is required to show if this approach holds potential for environmental stressor predictions. Data types had no impact on SPP while feature selection significantly improved SPP for omics-based methods but not for amplicon sequencing, and if similar studies confirm these results, then this would strongly favor the application of amplicon sequencing over omics-based methods for environmental assessments. However, we only investigated taxonomic information, whereas omics-based methods also generate functional information, and it remains to be tested whether the integration of taxonomic and functional information can further improve omics-based environmental assessments.</p>
</sec>
</sec>
<sec sec-type="data-availability" id="sec34">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found at: <ext-link xlink:href="https://www.ncbi.nlm.nih.gov/" ext-link-type="uri">https://www.ncbi.nlm.nih.gov/</ext-link>, PRJNA903104.</p>
</sec>
<sec sec-type="author-contributions" id="sec35">
<title>Author contributions</title>
<p>DB, LM, MB, and FL designed the experiment. DB, LM, and MB conducted the experiment and collected the samples. DB, LM, and CH processed the samples. DB and CH processed the sequencing data. CH and DT analyzed the data. CH drafted the manuscript. All authors read and approved the final manuscript.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec36">
<title>Funding</title>
<p>CH was funded through the Canada First Research Excellence Fund to the program CFREF&#x2013;Food from Thought at the University of Guelph. DB, MB, and the field experiment were funded through the DFG grants LE 2323/9-1, MA, and SCHA. LM was funded through the Land2Sea project (Aquatic Ecosystem Services in a Changing World, <ext-link xlink:href="https://land2sea.ucd.ie/" ext-link-type="uri">https://land2sea.ucd.ie/</ext-link>; funded under the Joint BiodivERsA-Belmont Forum call and the DFG) and the DFG project LE2323/9-1/MA XXXXX 418091530.</p>
</sec>
<ack>
<p>We are grateful to Christoph Mayer, Peter Haase, and Ralf Sch&#x00E4;fer for their support during the grant application, and we thank Verena Schreiner for performing the pesticide analysis and Romana Salis for performing the taxonomic annotation of amplicon sequencing data.</p>
</ack>
<sec sec-type="COI-statement" id="sec37">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec38">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2023.1217750/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmicb.2023.1217750/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.xlsx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_2.xlsx" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_3.xlsx" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Data_Sheet_4.docx" id="SM4" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn id="fn0001">
<p><sup>1</sup><ext-link xlink:href="https://deims.org/9f9ba137-342d-4813-ae58-a60911c3abc1" ext-link-type="uri">https://deims.org/9f9ba137-342d-4813-ae58-a60911c3abc1</ext-link></p>
</fn>
<fn id="fn0002">
<p><sup>2</sup>For the modified protocol, see <ext-link xlink:href="https://doi.org/10.17504/protocols.io.bp2l69n2dlqe/v1" ext-link-type="uri">https://doi.org/10.17504/protocols.io.bp2l69n2dlqe/v1</ext-link></p>
</fn>
<fn id="fn0003">
<p><sup>3</sup>v1.1.0, <ext-link xlink:href="https://github.com/DominikBuchner/demultiplexer" ext-link-type="uri">https://github.com/DominikBuchner/demultiplexer</ext-link></p>
</fn>
<fn id="fn0004">
<p><sup>4</sup><ext-link xlink:href="https://github.com/hempelc/metagenomics-vs-totalRNASeq" ext-link-type="uri">https://github.com/hempelc/metagenomics-vs-totalRNASeq</ext-link></p>
</fn>
<fn id="fn0005">
<p><sup>5</sup><ext-link xlink:href="https://github.com/hempelc/exstream-metagenomics-totalrnaseq-ml" ext-link-type="uri">https://github.com/hempelc/exstream-metagenomics-totalrnaseq-ml</ext-link></p>
</fn>
<fn id="fn0006">
<p><sup>6</sup>NCBI taxonomy file <italic>names.dmp</italic>, available through the NCBI archive as part of <italic>taxdmp.zip</italic>, <ext-link xlink:href="https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/" ext-link-type="uri">https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/</ext-link></p>
</fn>
<fn id="fn0007">
<p><sup>7</sup><ext-link xlink:href="https://github.com/AutoViML/featurewiz" ext-link-type="uri">https://github.com/AutoViML/featurewiz</ext-link></p>
</fn>
<fn id="fn0008">
<p><sup>8</sup><ext-link xlink:href="https://github.com/scikit-optimize/scikit-optimize" ext-link-type="uri">https://github.com/scikit-optimize/scikit-optimize</ext-link></p>
</fn>
<fn id="fn0009">
<p><sup>9</sup><ext-link xlink:href="https://github.com/hempelc/exstream-metagenomics-totalrnaseq-ml" ext-link-type="uri">https://github.com/hempelc/exstream-metagenomics-totalrnaseq-ml</ext-link></p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Abarenkov</surname> <given-names>K.</given-names></name> <name><surname>Zirk</surname> <given-names>A.</given-names></name> <name><surname>Piirmann</surname> <given-names>T.</given-names></name> <name><surname>P&#x00F6;h&#x00F6;nen</surname> <given-names>R.</given-names></name> <name><surname>Ivanov</surname> <given-names>F.</given-names></name> <name><surname>Nilsson</surname> <given-names>R. H.</given-names></name> <etal/></person-group>. (<year>2021</year>). <italic>UNITE general FASTA release for eukaryotes</italic>.</citation></ref>
<ref id="ref2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ai</surname> <given-names>D.</given-names></name> <name><surname>Pan</surname> <given-names>H.</given-names></name> <name><surname>Han</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>G.</given-names></name> <name><surname>Xia</surname> <given-names>L. C.</given-names></name></person-group> (<year>2019</year>). <article-title>Using decision tree aggregation with random forest model to identify gut microbes associated with colorectal cancer</article-title>. <source>Genes (Basel)</source> <volume>10</volume>:<fpage>112</fpage>. doi: <pub-id pub-id-type="doi">10.3390/genes10020112</pub-id>, PMID: <pub-id pub-id-type="pmid">30717284</pub-id></citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Almeida</surname> <given-names>O. G. G.</given-names></name> <name><surname>De Martinis</surname> <given-names>E. C. P.</given-names></name></person-group> (<year>2019</year>). <article-title>Bioinformatics tools to assess metagenomic data for applied microbiology</article-title>. <source>Appl. Microbiol. Biotechnol.</source> <volume>103</volume>, <fpage>69</fpage>&#x2013;<lpage>82</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s00253-018-9464-9</pub-id>, PMID: <pub-id pub-id-type="pmid">30362076</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Auladell</surname> <given-names>A.</given-names></name> <name><surname>Barber&#x00E1;n</surname> <given-names>A.</given-names></name> <name><surname>Logares</surname> <given-names>R.</given-names></name> <name><surname>Garc&#x00E9;s</surname> <given-names>E.</given-names></name> <name><surname>Gasol</surname> <given-names>J. M.</given-names></name> <name><surname>Ferrera</surname> <given-names>I.</given-names></name></person-group> (<year>2022</year>). <article-title>Seasonal niche differentiation among closely related marine bacteria</article-title>. <source>ISME J.</source> <volume>16</volume>, <fpage>178</fpage>&#x2013;<lpage>189</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41396-021-01053-2</pub-id>, PMID: <pub-id pub-id-type="pmid">34285363</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bang-Andreasen</surname> <given-names>T.</given-names></name> <name><surname>Anwar</surname> <given-names>M. Z.</given-names></name> <name><surname>Lanz&#x00E9;n</surname> <given-names>A.</given-names></name> <name><surname>Kj&#x00F8;ller</surname> <given-names>R.</given-names></name> <name><surname>R&#x00F8;nn</surname> <given-names>R.</given-names></name> <name><surname>Ekelund</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Total RNA sequencing reveals multilevel microbial community changes and functional responses to wood ash application in agricultural and forest soil</article-title>. <source>FEMS Microbiol. Ecol.</source> <volume>96</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1093/femsec/fiaa016</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bankevich</surname> <given-names>A.</given-names></name> <name><surname>Nurk</surname> <given-names>S.</given-names></name> <name><surname>Antipov</surname> <given-names>D.</given-names></name> <name><surname>Gurevich</surname> <given-names>A. A.</given-names></name> <name><surname>Dvorkin</surname> <given-names>M.</given-names></name> <name><surname>Kulikov</surname> <given-names>A. S.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing</article-title>. <source>J. Comput. Biol.</source> <volume>19</volume>, <fpage>455</fpage>&#x2013;<lpage>477</lpage>. doi: <pub-id pub-id-type="doi">10.1089/cmb.2012.0021</pub-id>, PMID: <pub-id pub-id-type="pmid">22506599</pub-id></citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bashiardes</surname> <given-names>S.</given-names></name> <name><surname>Zilberman-Schapira</surname> <given-names>G.</given-names></name> <name><surname>Elinav</surname> <given-names>E.</given-names></name></person-group> (<year>2016</year>). <article-title>Use of metatranscriptomics in microbiome research</article-title>. <source>Bioinform. Biol. Insights</source> <volume>10</volume>, <fpage>19</fpage>&#x2013;<lpage>25</lpage>. doi: <pub-id pub-id-type="doi">10.4137/BBI.S34610</pub-id>, PMID: <pub-id pub-id-type="pmid">27127406</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Blanco-M&#x00ED;guez</surname> <given-names>A.</given-names></name> <name><surname>Beghini</surname> <given-names>F.</given-names></name> <name><surname>Cumbo</surname> <given-names>F.</given-names></name> <name><surname>McIver</surname> <given-names>L. J.</given-names></name> <name><surname>Thompson</surname> <given-names>K. N.</given-names></name> <name><surname>Zolfo</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Extending and improving metagenomic taxonomic profiling with uncharacterized species using MetaPhlAn 4</article-title>. <source>Nat. Biotechnol.</source> <volume>41</volume>, <fpage>1633</fpage>&#x2013;<lpage>1644</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-023-01688-w</pub-id>, PMID: <pub-id pub-id-type="pmid">36823356</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bolger</surname> <given-names>A. M.</given-names></name> <name><surname>Lohse</surname> <given-names>M.</given-names></name> <name><surname>Usadel</surname> <given-names>B.</given-names></name></person-group> (<year>2014</year>). <article-title>Trimmomatic: a flexible trimmer for Illumina sequence data</article-title>. <source>Bioinformatics</source> <volume>30</volume>, <fpage>2114</fpage>&#x2013;<lpage>2120</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btu170</pub-id>, PMID: <pub-id pub-id-type="pmid">24695404</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brumfield</surname> <given-names>K. D.</given-names></name> <name><surname>Huq</surname> <given-names>A.</given-names></name> <name><surname>Colwell</surname> <given-names>R. R.</given-names></name> <name><surname>Olds</surname> <given-names>J. L.</given-names></name> <name><surname>Leddy</surname> <given-names>M. B.</given-names></name></person-group> (<year>2020</year>). <article-title>Microbial resolution of whole genome shotgun and 16S amplicon metagenomic sequencing using publicly available NEON data</article-title>. <source>PLoS One</source> <volume>15</volume>, <fpage>1</fpage>&#x2013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0228899</pub-id></citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Buchner</surname> <given-names>D.</given-names></name> <name><surname>Beermann</surname> <given-names>A. J.</given-names></name> <name><surname>Laini</surname> <given-names>A.</given-names></name> <name><surname>Rolauffs</surname> <given-names>P.</given-names></name> <name><surname>Vitecek</surname> <given-names>S.</given-names></name> <name><surname>Hering</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Analysis of 13,312 benthic invertebrate samples from German streams reveals minor deviations in ecological status class between abundance and presence/absence data</article-title>. <source>PLoS One</source> <volume>14</volume>, <fpage>1</fpage>&#x2013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0226547</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Buchner</surname> <given-names>D.</given-names></name> <name><surname>Beermann</surname> <given-names>A. J.</given-names></name> <name><surname>Leese</surname> <given-names>F.</given-names></name> <name><surname>Weiss</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>Cooking small and large portions of &#x201C;biodiversity-soup&#x201D;: miniaturized DNA metabarcoding PCRs perform as good as large-volume PCRs</article-title>. <source>Ecol. Evol.</source> <volume>11</volume>, <fpage>9092</fpage>&#x2013;<lpage>9099</lpage>. doi: <pub-id pub-id-type="doi">10.1002/ece3.7753</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Buchner</surname> <given-names>D.</given-names></name> <name><surname>Macher</surname> <given-names>T.-H.</given-names></name> <name><surname>Leese</surname> <given-names>F.</given-names></name></person-group> (<year>2022</year>). <article-title>APSCALE: advanced pipeline for simple yet comprehensive analyses of DNA metabarcoding data</article-title>. <source>Bioinformatics</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>3</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btac588</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Callahan</surname> <given-names>B. J.</given-names></name> <name><surname>McMurdie</surname> <given-names>P. J.</given-names></name> <name><surname>Holmes</surname> <given-names>S. P.</given-names></name></person-group> (<year>2017</year>). <article-title>Exact sequence variants should replace operational taxonomic units in marker-gene data analysis</article-title>. <source>ISME J.</source> <volume>11</volume>, <fpage>2639</fpage>&#x2013;<lpage>2643</lpage>. doi: <pub-id pub-id-type="doi">10.1038/ismej.2017.119</pub-id>, PMID: <pub-id pub-id-type="pmid">28731476</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Caporaso</surname> <given-names>J. G.</given-names></name> <name><surname>Lauber</surname> <given-names>C. L.</given-names></name> <name><surname>Walters</surname> <given-names>W. A.</given-names></name> <name><surname>Berg-Lyons</surname> <given-names>D.</given-names></name> <name><surname>Lozupone</surname> <given-names>C. A.</given-names></name> <name><surname>Turnbaugh</surname> <given-names>P. J.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample</article-title>. <source>Proc. Natl. Acad. Sci. U. S. A.</source> <volume>108</volume>, <fpage>4516</fpage>&#x2013;<lpage>4522</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.1000080107</pub-id>, PMID: <pub-id pub-id-type="pmid">20534432</pub-id></citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Caruso</surname> <given-names>V.</given-names></name> <name><surname>Song</surname> <given-names>X.</given-names></name> <name><surname>Asquith</surname> <given-names>M.</given-names></name> <name><surname>Karstens</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Performance of microbiome sequence inference methods in environments with varying biomass</article-title>. <source>mSystems</source> <volume>4</volume>:<fpage>e00163</fpage>. doi: <pub-id pub-id-type="doi">10.1128/msystems.00163-18</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chan</surname> <given-names>C. S.</given-names></name> <name><surname>Chan</surname> <given-names>K. G.</given-names></name> <name><surname>Tay</surname> <given-names>Y. L.</given-names></name> <name><surname>Chua</surname> <given-names>Y. H.</given-names></name> <name><surname>Goh</surname> <given-names>K. M.</given-names></name></person-group> (<year>2015</year>). <article-title>Diversity of thermophiles in a Malaysian hot spring determined using 16S rRNA and shotgun metagenome sequencing</article-title>. <source>Front. Microbiol.</source> <volume>6</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2015.00177</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chang</surname> <given-names>H. X.</given-names></name> <name><surname>Haudenshield</surname> <given-names>J. S.</given-names></name> <name><surname>Bowen</surname> <given-names>C. R.</given-names></name> <name><surname>Hartman</surname> <given-names>G. L.</given-names></name></person-group> (<year>2017</year>). <article-title>Metagenome-wide association study and machine learning prediction of bulk soil microbiome and crop productivity</article-title>. <source>Front. Microbiol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2017.00519</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Guestrin</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). <italic>XGBoost: a scalable tree boosting system</italic>. In: Proceedings of the 22nd ACM SIGKDD international conference on knowledge discovery and data mining. New York. 785&#x2013;794.</citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Choe</surname> <given-names>Y. H.</given-names></name> <name><surname>Kim</surname> <given-names>M.</given-names></name> <name><surname>Lee</surname> <given-names>Y. K.</given-names></name></person-group> (<year>2021</year>). <article-title>Distinct microbial communities in adjacent rock and soil substrates on a high Arctic Polar Desert</article-title>. <source>Front. Microbiol.</source> <volume>11</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2020.607396</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cordier</surname> <given-names>T.</given-names></name> <name><surname>Alonso-S&#x00E1;ez</surname> <given-names>L.</given-names></name> <name><surname>Apoth&#x00E9;loz-Perret-Gentil</surname> <given-names>L.</given-names></name> <name><surname>Aylagas</surname> <given-names>E.</given-names></name> <name><surname>Bohan</surname> <given-names>D. A.</given-names></name> <name><surname>Bouchez</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Ecosystems monitoring powered by environmental genomics: a review of current strategies with an implementation roadmap</article-title>. <source>Mol. Ecol.</source> <volume>30</volume>, <fpage>2937</fpage>&#x2013;<lpage>2958</lpage>. doi: <pub-id pub-id-type="doi">10.1111/mec.15472</pub-id>, PMID: <pub-id pub-id-type="pmid">32416615</pub-id></citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cordier</surname> <given-names>T.</given-names></name> <name><surname>Esling</surname> <given-names>P.</given-names></name> <name><surname>Lejzerowicz</surname> <given-names>F.</given-names></name> <name><surname>Visco</surname> <given-names>J. A.</given-names></name> <name><surname>Ouadahi</surname> <given-names>A.</given-names></name> <name><surname>Martins</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Predicting the ecological quality status of marine environments from eDNA metabarcoding data using supervised machine learning</article-title>. <source>Environ. Sci. Technol.</source> <volume>51</volume>, <fpage>9118</fpage>&#x2013;<lpage>9126</lpage>. doi: <pub-id pub-id-type="doi">10.1021/acs.est.7b01518</pub-id>, PMID: <pub-id pub-id-type="pmid">28665601</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cordier</surname> <given-names>T.</given-names></name> <name><surname>Forster</surname> <given-names>D.</given-names></name> <name><surname>Dufresne</surname> <given-names>Y.</given-names></name> <name><surname>Martins</surname> <given-names>C. I. M.</given-names></name> <name><surname>Stoeck</surname> <given-names>T.</given-names></name> <name><surname>Pawlowski</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>Supervised machine learning outperforms taxonomy-based environmental DNA metabarcoding applied to biomonitoring</article-title>. <source>Mol. Ecol. Resour.</source> <volume>18</volume>, <fpage>1381</fpage>&#x2013;<lpage>1391</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1755-0998.12926</pub-id>, PMID: <pub-id pub-id-type="pmid">30014577</pub-id></citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cordier</surname> <given-names>T.</given-names></name> <name><surname>Lanz&#x00E9;n</surname> <given-names>A.</given-names></name> <name><surname>Apoth&#x00E9;loz-Perret-Gentil</surname> <given-names>L.</given-names></name> <name><surname>Stoeck</surname> <given-names>T.</given-names></name> <name><surname>Pawlowski</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Embracing environmental genomics and machine learning for routine biomonitoring</article-title>. <source>Trends Microbiol.</source> <volume>27</volume>, <fpage>387</fpage>&#x2013;<lpage>397</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.tim.2018.10.012</pub-id>, PMID: <pub-id pub-id-type="pmid">30554770</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Crisci</surname> <given-names>C.</given-names></name> <name><surname>Ghattas</surname> <given-names>B.</given-names></name> <name><surname>Perera</surname> <given-names>G.</given-names></name></person-group> (<year>2012</year>). <article-title>A review of supervised machine learning algorithms and their applications to ecological data</article-title>. <source>Ecol. Model.</source> <volume>240</volume>, <fpage>113</fpage>&#x2013;<lpage>122</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecolmodel.2012.03.001</pub-id>, PMID: <pub-id pub-id-type="pmid">26786791</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="other"><person-group person-group-type="author"><name><surname>D&#x00ED;az</surname> <given-names>S.</given-names></name> <name><surname>Settele</surname> <given-names>J.</given-names></name> <name><surname>Brond&#x00ED;zio</surname> <given-names>E. S.</given-names></name> <name><surname>Ngo</surname> <given-names>H. T.</given-names></name> <name><surname>Gu&#x00E8;ze</surname> <given-names>M.</given-names></name> <name><surname>Agard</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2019</year>). <italic>Summary for policymakers of the global assessment report on biodiversity and ecosystem services of the intergovernmental science-policy platform on biodiversity and ecosystem services</italic>. Bonn, Germany.</citation></ref>
<ref id="ref27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dillies</surname> <given-names>M. A.</given-names></name> <name><surname>Rau</surname> <given-names>A.</given-names></name> <name><surname>Aubert</surname> <given-names>J.</given-names></name> <name><surname>Hennequet-Antier</surname> <given-names>C.</given-names></name> <name><surname>Jeanmougin</surname> <given-names>M.</given-names></name> <name><surname>Servant</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>A comprehensive evaluation of normalization methods for Illumina high-throughput RNA sequencing data analysis</article-title>. <source>Brief. Bioinform.</source> <volume>14</volume>, <fpage>671</fpage>&#x2013;<lpage>683</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbs046</pub-id>, PMID: <pub-id pub-id-type="pmid">22988256</pub-id></citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Douglas</surname> <given-names>G. M.</given-names></name> <name><surname>Hansen</surname> <given-names>R.</given-names></name> <name><surname>Jones</surname> <given-names>C. M. A.</given-names></name> <name><surname>Dunn</surname> <given-names>K. A.</given-names></name> <name><surname>Comeau</surname> <given-names>A. M.</given-names></name> <name><surname>Bielawski</surname> <given-names>J. P.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Multi-omics differentially classify disease state and treatment outcome in pediatric Crohn&#x2019;s disease</article-title>. <source>Microbiome</source> <volume>6</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-018-0398-3</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dully</surname> <given-names>V.</given-names></name> <name><surname>Balliet</surname> <given-names>H.</given-names></name> <name><surname>Fr&#x00FC;he</surname> <given-names>L.</given-names></name> <name><surname>D&#x00E4;umer</surname> <given-names>M.</given-names></name> <name><surname>Thielen</surname> <given-names>A.</given-names></name> <name><surname>Gallie</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Robustness, sensitivity and reproducibility of eDNA metabarcoding as an environmental biomonitoring tool in coastal salmon aquaculture&#x2013;an inter-laboratory study</article-title>. <source>Ecol. Indic.</source> <volume>121</volume>:<fpage>107049</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecolind.2020.107049</pub-id></citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Farinella</surname> <given-names>R.</given-names></name> <name><surname>Rizzato</surname> <given-names>C.</given-names></name> <name><surname>Bottai</surname> <given-names>D.</given-names></name> <name><surname>Bedini</surname> <given-names>A.</given-names></name> <name><surname>Gemignani</surname> <given-names>F.</given-names></name> <name><surname>Landi</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Maternal anthropometric variables and clinical factors shape neonatal microbiome</article-title>. <source>Sci. Rep.</source> <volume>12</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-022-06792-6</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Frey</surname> <given-names>B.</given-names></name> <name><surname>Rime</surname> <given-names>T.</given-names></name> <name><surname>Phillips</surname> <given-names>M.</given-names></name> <name><surname>Stierli</surname> <given-names>B.</given-names></name> <name><surname>Hajdas</surname> <given-names>I.</given-names></name> <name><surname>Widmer</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Microbial diversity in European alpine permafrost and active layers</article-title>. <source>FEMS Microbiol. Ecol.</source> <volume>92</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi: <pub-id pub-id-type="doi">10.1093/femsec/fiw018</pub-id></citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fr&#x00F8;slev</surname> <given-names>T. G.</given-names></name> <name><surname>Kj&#x00F8;ller</surname> <given-names>R.</given-names></name> <name><surname>Bruun</surname> <given-names>H. H.</given-names></name> <name><surname>Ejrn&#x00E6;s</surname> <given-names>R.</given-names></name> <name><surname>Brunbjerg</surname> <given-names>A. K.</given-names></name> <name><surname>Pietroni</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Algorithm for post-clustering curation of DNA amplicon data yields reliable biodiversity estimates</article-title>. <source>Nat. Commun.</source> <volume>8</volume>:<fpage>312</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-017-01312-x</pub-id>, PMID: <pub-id pub-id-type="pmid">29084957</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fr&#x00FC;he</surname> <given-names>L.</given-names></name> <name><surname>Cordier</surname> <given-names>T.</given-names></name> <name><surname>Dully</surname> <given-names>V.</given-names></name> <name><surname>Breiner</surname> <given-names>H.-W.</given-names></name> <name><surname>Lentendu</surname> <given-names>G.</given-names></name> <name><surname>Pawlowski</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Supervised machine learning is superior to indicator value inference in monitoring the environmental impacts of salmon aquaculture using eDNA metabarcodes</article-title>. <source>Mol. Ecol.</source> <volume>30</volume>, <fpage>2988</fpage>&#x2013;<lpage>3006</lpage>. doi: <pub-id pub-id-type="doi">10.1111/mec.15434</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gerhard</surname> <given-names>W. A.</given-names></name> <name><surname>Gunsch</surname> <given-names>C. K.</given-names></name></person-group> (<year>2019</year>). <article-title>Metabarcoding and machine learning analysis of environmental DNA in ballast water arriving to hub ports</article-title>. <source>Environ. Int.</source> <volume>124</volume>, <fpage>312</fpage>&#x2013;<lpage>319</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.envint.2018.12.038</pub-id>, PMID: <pub-id pub-id-type="pmid">30660844</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ghannam</surname> <given-names>R. B.</given-names></name> <name><surname>Techtmann</surname> <given-names>S. M.</given-names></name></person-group> (<year>2021</year>). <article-title>Machine learning applications in microbial ecology, human microbiome studies, and environmental monitoring</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>19</volume>, <fpage>1092</fpage>&#x2013;<lpage>1107</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.csbj.2021.01.028</pub-id>, PMID: <pub-id pub-id-type="pmid">33680353</pub-id></citation></ref>
<ref id="ref36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Glasl</surname> <given-names>B.</given-names></name> <name><surname>Bourne</surname> <given-names>D. G.</given-names></name> <name><surname>Frade</surname> <given-names>P. R.</given-names></name> <name><surname>Thomas</surname> <given-names>T.</given-names></name> <name><surname>Schaffelke</surname> <given-names>B.</given-names></name> <name><surname>Webster</surname> <given-names>N. S.</given-names></name></person-group> (<year>2019</year>). <article-title>Microbial indicators of environmental perturbations in coral reef ecosystems</article-title>. <source>Microbiome</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-019-0705-7</pub-id></citation></ref>
<ref id="ref37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Glassman</surname> <given-names>S. I.</given-names></name> <name><surname>Martiny</surname> <given-names>J. B. H.</given-names></name></person-group> (<year>2018</year>). <article-title>Broadscale ecological patterns are robust to use of exact sequence variants versus operational taxonomic units</article-title>. <source>mSphere</source> <volume>3</volume>:<fpage>e00148</fpage>. doi: <pub-id pub-id-type="doi">10.1128/mSphere.00148-18</pub-id>, PMID: <pub-id pub-id-type="pmid">30021874</pub-id></citation></ref>
<ref id="ref38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gloor</surname> <given-names>G. B.</given-names></name> <name><surname>Macklaim</surname> <given-names>J. M.</given-names></name> <name><surname>Pawlowsky-Glahn</surname> <given-names>V.</given-names></name> <name><surname>Egozcue</surname> <given-names>J. J.</given-names></name></person-group> (<year>2017</year>). <article-title>Microbiome datasets are compositional: and this is not optional</article-title>. <source>Front. Microbiol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2017.02224</pub-id></citation></ref>
<ref id="ref39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Good</surname> <given-names>S. P.</given-names></name> <name><surname>Urycki</surname> <given-names>D. R.</given-names></name> <name><surname>Crump</surname> <given-names>B. C.</given-names></name></person-group> (<year>2018</year>). <article-title>Predicting hydrologic function with aquatic gene fragments</article-title>. <source>Water Resour. Res.</source> <volume>54</volume>, <fpage>2424</fpage>&#x2013;<lpage>2435</lpage>. doi: <pub-id pub-id-type="doi">10.1002/2017WR021974</pub-id></citation></ref>
<ref id="ref40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Greener</surname> <given-names>J. G.</given-names></name> <name><surname>Kandathil</surname> <given-names>S. M.</given-names></name> <name><surname>Moffat</surname> <given-names>L.</given-names></name> <name><surname>Jones</surname> <given-names>D. T.</given-names></name></person-group> (<year>2022</year>). <article-title>A guide to machine learning for biologists</article-title>. <source>Nat. Rev. Mol. Cell Biol.</source> <volume>23</volume>, <fpage>40</fpage>&#x2013;<lpage>55</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41580-021-00407-0</pub-id>, PMID: <pub-id pub-id-type="pmid">34518686</pub-id></citation></ref>
<ref id="ref41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Haase</surname> <given-names>P.</given-names></name> <name><surname>Frenzel</surname> <given-names>M.</given-names></name> <name><surname>Klotz</surname> <given-names>S.</given-names></name> <name><surname>Musche</surname> <given-names>M.</given-names></name> <name><surname>Stoll</surname> <given-names>S.</given-names></name></person-group> (<year>2016</year>). <article-title>The long-term ecological research (LTER) network: relevance, current status, future perspective and examples from marine, freshwater and terrestrial long-term observation</article-title>. <source>Ecol. Indic.</source> <volume>100</volume>, <fpage>1</fpage>&#x2013;<lpage>3</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecolind.2016.01.040</pub-id></citation></ref>
<ref id="ref42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hacilar</surname> <given-names>H.</given-names></name> <name><surname>Nalbantoglu</surname> <given-names>O. U.</given-names></name> <name><surname>Bakir-Gungor</surname> <given-names>B.</given-names></name></person-group> (<year>2018</year>). <article-title>Machine learning analysis of inflammatory bowel disease-associated metagenomics dataset</article-title>. <source>UBMK 2018-3rd Int. Conf. Comput. Sci. Eng.</source> <volume>2018</volume>, <fpage>434</fpage>&#x2013;<lpage>438</lpage>. doi: <pub-id pub-id-type="doi">10.1109/UBMK.2018.8566487</pub-id></citation></ref>
<ref id="ref43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Harris</surname> <given-names>C. R.</given-names></name> <name><surname>Millman</surname> <given-names>K. J.</given-names></name> <name><surname>Van der Walt</surname> <given-names>S. J.</given-names></name> <name><surname>Gommers</surname> <given-names>R.</given-names></name> <name><surname>Virtanen</surname> <given-names>P.</given-names></name> <name><surname>Cournapeau</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Array programming with NumPy</article-title>. <source>Nature</source> <volume>585</volume>, <fpage>357</fpage>&#x2013;<lpage>362</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id>, PMID: <pub-id pub-id-type="pmid">32939066</pub-id></citation></ref>
<ref id="ref44"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hempel</surname> <given-names>C. A.</given-names></name> <name><surname>Carson</surname> <given-names>S. E. E.</given-names></name> <name><surname>Elliott</surname> <given-names>T. A.</given-names></name> <name><surname>Adamowicz</surname> <given-names>S. J.</given-names></name> <name><surname>Steinke</surname> <given-names>D.</given-names></name></person-group> (<year>2023</year>). <article-title>Reconstruction of small subunit ribosomal RNA from high-throughput sequencing data: a comparative study of metagenomics and total RNA sequencing</article-title>. <source>Methods Ecol. Evol.</source> <volume>14</volume>, <fpage>2049</fpage>&#x2013;<lpage>2064</lpage>. doi: <pub-id pub-id-type="doi">10.1111/2041-210X.14149</pub-id></citation></ref>
<ref id="ref45"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hempel</surname> <given-names>C. A.</given-names></name> <name><surname>Wright</surname> <given-names>N.</given-names></name> <name><surname>Harvie</surname> <given-names>J.</given-names></name> <name><surname>Hleap</surname> <given-names>J. S.</given-names></name> <name><surname>Adamowicz</surname> <given-names>S. J.</given-names></name> <name><surname>Steinke</surname> <given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>Metagenomics versus total RNA sequencing: most accurate data-processing tools, microbial identification accuracy, and perspectives for freshwater assessments</article-title>. <source>Nucleic Acids Res.</source> <volume>50</volume>, <fpage>9279</fpage>&#x2013;<lpage>9293</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkac689</pub-id></citation></ref>
<ref id="ref46"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hermans</surname> <given-names>S. M.</given-names></name> <name><surname>Buckley</surname> <given-names>H. L.</given-names></name> <name><surname>Case</surname> <given-names>B. S.</given-names></name> <name><surname>Curran-Cournane</surname> <given-names>F.</given-names></name> <name><surname>Taylor</surname> <given-names>M.</given-names></name> <name><surname>Lear</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>Using soil bacterial communities to predict physico-chemical variables and soil quality</article-title>. <source>Microbiome</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-020-00858-1</pub-id></citation></ref>
<ref id="ref47"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Joos</surname> <given-names>L.</given-names></name> <name><surname>Beirinckx</surname> <given-names>S.</given-names></name> <name><surname>Haegeman</surname> <given-names>A.</given-names></name> <name><surname>Debode</surname> <given-names>J.</given-names></name> <name><surname>Vandecasteele</surname> <given-names>B.</given-names></name> <name><surname>Baeyen</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Daring to be differential: metabarcoding analysis of soil and plant-related microbial communities using amplicon sequence variants and operational taxonomical units</article-title>. <source>BMC Genomics</source> <volume>21</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s12864-020-07126-4</pub-id></citation></ref>
<ref id="ref48"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kang</surname> <given-names>W.</given-names></name> <name><surname>Anslan</surname> <given-names>S.</given-names></name> <name><surname>B&#x00F6;rner</surname> <given-names>N.</given-names></name> <name><surname>Schwarz</surname> <given-names>A.</given-names></name> <name><surname>Schmidt</surname> <given-names>R.</given-names></name> <name><surname>K&#x00FC;nzel</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Diatom metabarcoding and microscopic analyses from sediment samples at Lake Nam Co, Tibet: the effect of sample-size and bioinformatics on the identified communities</article-title>. <source>Ecol. Indic.</source> <volume>121</volume>:<fpage>7070</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecolind.2020.107070</pub-id></citation></ref>
<ref id="ref49"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kask</surname> <given-names>O.</given-names></name> <name><surname>Kyman</surname> <given-names>S.</given-names></name> <name><surname>Conn</surname> <given-names>K. A.</given-names></name> <name><surname>Gormley</surname> <given-names>J.</given-names></name> <name><surname>Gardner</surname> <given-names>J.</given-names></name> <name><surname>Johns</surname> <given-names>R. A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Environmental exposures influence nasal microbiome composition in a longitudinal study of division I collegiate athletes</article-title>. <source>BioRxiv</source> <volume>2020</volume>:<fpage>946475</fpage>. doi: <pub-id pub-id-type="doi">10.1101/2020.02.13.946475</pub-id></citation></ref>
<ref id="ref50"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khachatryan</surname> <given-names>L.</given-names></name> <name><surname>De Leeuw</surname> <given-names>R. H.</given-names></name> <name><surname>Kraakman</surname> <given-names>M. E. M.</given-names></name> <name><surname>Pappas</surname> <given-names>N.</given-names></name> <name><surname>Te Raa</surname> <given-names>M.</given-names></name> <name><surname>Mei</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Taxonomic classification and abundance estimation using 16S and WGS&#x2013;A comparison using controlled reference samples</article-title>. <source>Forensic Sci. Int. Genet.</source> <volume>46</volume>:<fpage>102257</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.fsigen.2020.102257</pub-id>, PMID: <pub-id pub-id-type="pmid">32058299</pub-id></citation></ref>
<ref id="ref51"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Knight</surname> <given-names>R.</given-names></name> <name><surname>Vrbanac</surname> <given-names>A.</given-names></name> <name><surname>Taylor</surname> <given-names>B. C.</given-names></name> <name><surname>Aksenov</surname> <given-names>A.</given-names></name> <name><surname>Callewaert</surname> <given-names>C.</given-names></name> <name><surname>Debelius</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Best practices for analysing microbiomes</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>16</volume>, <fpage>410</fpage>&#x2013;<lpage>422</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41579-018-0029-9</pub-id>, PMID: <pub-id pub-id-type="pmid">29795328</pub-id></citation></ref>
<ref id="ref52"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Knowles</surname> <given-names>S. C. L.</given-names></name> <name><surname>Eccles</surname> <given-names>R. M.</given-names></name> <name><surname>Baltr&#x016B;nait&#x0117;</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Species identity dominates over environment in shaping the microbiota of small mammals</article-title>. <source>Ecol. Lett.</source> <volume>22</volume>, <fpage>826</fpage>&#x2013;<lpage>837</lpage>. doi: <pub-id pub-id-type="doi">10.1111/ele.13240</pub-id>, PMID: <pub-id pub-id-type="pmid">30868708</pub-id></citation></ref>
<ref id="ref53"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Koh</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Zhan</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>N.</given-names></name></person-group> (<year>2019</year>). <article-title>A distance-based kernel association test based on the generalized linear mixed model for correlated microbiome studies</article-title>. <source>Front. Genet.</source> <volume>10</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fgene.2019.00458</pub-id></citation></ref>
<ref id="ref54"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kubiszewski</surname> <given-names>I.</given-names></name> <name><surname>Costanza</surname> <given-names>R.</given-names></name> <name><surname>Anderson</surname> <given-names>S.</given-names></name> <name><surname>Sutton</surname> <given-names>P.</given-names></name></person-group> (<year>2017</year>). <article-title>The future value of ecosystem services: global scenarios and national implications</article-title>. <source>Ecosyst. Serv.</source> <volume>26</volume>, <fpage>289</fpage>&#x2013;<lpage>301</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ecoser.2017.05.004</pub-id></citation></ref>
<ref id="ref55"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lanz&#x00E9;n</surname> <given-names>A.</given-names></name> <name><surname>Mendibil</surname> <given-names>I.</given-names></name> <name><surname>Borja</surname> <given-names>A.</given-names></name> <name><surname>Saez</surname> <given-names>L. A.</given-names></name></person-group> (<year>2020</year>). <article-title>A microbial mandala for environmental monitoring &#x2013; predicting multiple impacts on estuarine prokaryote communities of the Bay of Biscay</article-title>. <source>Mol. Ecol.</source> <volume>30</volume>, <fpage>2969</fpage>&#x2013;<lpage>2987</lpage>. doi: <pub-id pub-id-type="doi">10.1111/mec.15489</pub-id></citation></ref>
<ref id="ref56"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Laudadio</surname> <given-names>I.</given-names></name> <name><surname>Fulci</surname> <given-names>V.</given-names></name> <name><surname>Palone</surname> <given-names>F.</given-names></name> <name><surname>Stronati</surname> <given-names>L.</given-names></name> <name><surname>Cucchiara</surname> <given-names>S.</given-names></name> <name><surname>Carissimi</surname> <given-names>C.</given-names></name></person-group> (<year>2018</year>). <article-title>Quantitative assessment of shotgun metagenomics and 16S rDNA amplicon sequencing in the study of human gut microbiome</article-title>. <source>Omi. A J. Integr. Biol.</source> <volume>22</volume>, <fpage>248</fpage>&#x2013;<lpage>254</lpage>. doi: <pub-id pub-id-type="doi">10.1089/omi.2018.0013</pub-id></citation></ref>
<ref id="ref57"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Laursen</surname> <given-names>M. F.</given-names></name> <name><surname>Dalgaard</surname> <given-names>M. D.</given-names></name> <name><surname>Bahl</surname> <given-names>M. I.</given-names></name></person-group> (<year>2017</year>). <article-title>Genomic GC-content affects the accuracy of 16S rRNA gene sequencing based microbial profiling due to PCR bias</article-title>. <source>Front. Microbiol.</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2017.01934</pub-id></citation></ref>
<ref id="ref58"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Leese</surname> <given-names>F.</given-names></name> <name><surname>Bouchez</surname> <given-names>A.</given-names></name> <name><surname>Abarenkov</surname> <given-names>K.</given-names></name> <name><surname>Altermatt</surname> <given-names>F.</given-names></name> <name><surname>Borja</surname> <given-names>&#x00C1;.</given-names></name> <name><surname>Bruce</surname> <given-names>K.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Why we need sustainable networks bridging countries, disciplines, cultures and generations for aquatic biomonitoring 2.0: a perspective derived from the DNAqua-net COST action</article-title>. <source>Adv. Ecol. Res.</source> <volume>58</volume>, <fpage>63</fpage>&#x2013;<lpage>99</lpage>. doi: <pub-id pub-id-type="doi">10.1016/bs.aecr.2018.01.001</pub-id></citation></ref>
<ref id="ref59"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Durbin</surname> <given-names>R.</given-names></name></person-group> (<year>2009</year>). <article-title>Fast and accurate short read alignment with Burrows-Wheeler transform</article-title>. <source>Bioinformatics</source> <volume>25</volume>, <fpage>1754</fpage>&#x2013;<lpage>1760</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btp324</pub-id>, PMID: <pub-id pub-id-type="pmid">19451168</pub-id></citation></ref>
<ref id="ref60"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Guan</surname> <given-names>L. L.</given-names></name></person-group> (<year>2017</year>). <article-title>Metatranscriptomic profiling reveals linkages between the active rumen microbiome and feed efficiency in beef cattle</article-title>. <source>Appl. Environ. Microbiol.</source> <volume>83</volume>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. doi: <pub-id pub-id-type="doi">10.1128/AEM.00061-17</pub-id></citation></ref>
<ref id="ref61"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Handsaker</surname> <given-names>B.</given-names></name> <name><surname>Wysoker</surname> <given-names>A.</given-names></name> <name><surname>Fennell</surname> <given-names>T.</given-names></name> <name><surname>Ruan</surname> <given-names>J.</given-names></name> <name><surname>Homer</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>The sequence alignment/map format and SAMtools</article-title>. <source>Bioinformatics</source> <volume>25</volume>, <fpage>2078</fpage>&#x2013;<lpage>2079</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btp352</pub-id>, PMID: <pub-id pub-id-type="pmid">19505943</pub-id></citation></ref>
<ref id="ref62"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Henderson</surname> <given-names>G.</given-names></name> <name><surname>Sun</surname> <given-names>X.</given-names></name> <name><surname>Cox</surname> <given-names>F.</given-names></name> <name><surname>Janssen</surname> <given-names>P. H.</given-names></name> <name><surname>Guan</surname> <given-names>L. L.</given-names></name></person-group> (<year>2016</year>). <article-title>Taxonomic assessment of rumen microbiota using total RNA and targeted amplicon sequencing approaches</article-title>. <source>Front. Microbiol.</source> <volume>7</volume>:<fpage>987</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2016.00987</pub-id>, PMID: <pub-id pub-id-type="pmid">27446027</pub-id></citation></ref>
<ref id="ref63"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Liu</surname> <given-names>C. M.</given-names></name> <name><surname>Luo</surname> <given-names>R.</given-names></name> <name><surname>Sadakane</surname> <given-names>K.</given-names></name> <name><surname>Lam</surname> <given-names>T. W.</given-names></name></person-group> (<year>2015</year>). <article-title>MEGAHIT: an ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph</article-title>. <source>Bioinformatics</source> <volume>31</volume>, <fpage>1674</fpage>&#x2013;<lpage>1676</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btv033</pub-id>, PMID: <pub-id pub-id-type="pmid">25609793</pub-id></citation></ref>
<ref id="ref64"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Logares</surname> <given-names>R.</given-names></name> <name><surname>Sunagawa</surname> <given-names>S.</given-names></name> <name><surname>Salazar</surname> <given-names>G.</given-names></name> <name><surname>Cornejo-Castillo</surname> <given-names>F. M.</given-names></name> <name><surname>Ferrera</surname> <given-names>I.</given-names></name> <name><surname>Sarmento</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Metagenomic 16S rDNA Illumina tags are a powerful alternative to amplicon sequencing to explore diversity and structure of microbial communities</article-title>. <source>Environ. Microbiol.</source> <volume>16</volume>, <fpage>2659</fpage>&#x2013;<lpage>2671</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1462-2920.12250</pub-id>, PMID: <pub-id pub-id-type="pmid">24102695</pub-id></citation></ref>
<ref id="ref65"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lozupone</surname> <given-names>C. A.</given-names></name> <name><surname>Stombaugh</surname> <given-names>J.</given-names></name> <name><surname>Gonzalez</surname> <given-names>A.</given-names></name> <name><surname>Ackermann</surname> <given-names>G.</given-names></name> <name><surname>Wendel</surname> <given-names>D.</given-names></name> <name><surname>V&#x00E1;zquez-Baeza</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>Meta-analyses of studies of the human microbiota</article-title>. <source>Genome Res.</source> <volume>23</volume>, <fpage>1704</fpage>&#x2013;<lpage>1714</lpage>. doi: <pub-id pub-id-type="doi">10.1101/gr.151803.112</pub-id>, PMID: <pub-id pub-id-type="pmid">23861384</pub-id></citation></ref>
<ref id="ref66"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Mack</surname> <given-names>L.</given-names></name> <name><surname>Buchner</surname> <given-names>D.</given-names></name> <name><surname>Brasseur</surname> <given-names>M. V.</given-names></name> <name><surname>Leese</surname> <given-names>F.</given-names></name> <name><surname>Piggott</surname> <given-names>J. J.</given-names></name> <name><surname>Tiegs</surname> <given-names>S. D.</given-names></name> <etal/></person-group>. (<year>2022</year>). <italic>Fine sediment and the insecticide chlorantraniliprole inhibit organic matter decomposition in streams through different pathways</italic>. Freshw. Biol.</citation></ref>
<ref id="ref67"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Marcos-Zambrano</surname> <given-names>L. J.</given-names></name> <name><surname>Karaduzovic-Hadziabdic</surname> <given-names>K.</given-names></name> <name><surname>Loncar Turukalo</surname> <given-names>T.</given-names></name> <name><surname>Przymus</surname> <given-names>P.</given-names></name> <name><surname>Trajkovik</surname> <given-names>V.</given-names></name> <name><surname>Aasmets</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Applications of machine learning in human microbiome studies: a review on feature selection, biomarker identification, disease prediction and treatment</article-title>. <source>Front. Microbiol.</source> <volume>12</volume>:<fpage>4511</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2021.634511</pub-id>, PMID: <pub-id pub-id-type="pmid">33737920</pub-id></citation></ref>
<ref id="ref68"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Martin</surname> <given-names>M.</given-names></name></person-group> (<year>2011</year>). <article-title>Cutadapt removes adapter sequences from high-throughput sequencing reads</article-title>. <source>EMBnet J.</source> <volume>17</volume>, <fpage>10</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.14806/ej.17.1.200</pub-id>, PMID: <pub-id pub-id-type="pmid">28715235</pub-id></citation></ref>
<ref id="ref69"><citation citation-type="other"><person-group person-group-type="author"><name><surname>McLaren</surname> <given-names>M. R.</given-names></name> <name><surname>Callahan</surname> <given-names>B. J.</given-names></name></person-group> (<year>2021</year>). <italic>Silva 138.1 prokaryotic SSU taxonomic training data formatted for DADA2</italic>.</citation></ref>
<ref id="ref70"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meisel</surname> <given-names>J. S.</given-names></name> <name><surname>Hannigan</surname> <given-names>G. D.</given-names></name> <name><surname>Tyldsley</surname> <given-names>A. S.</given-names></name> <name><surname>SanMiguel</surname> <given-names>A. J.</given-names></name> <name><surname>Hodkinson</surname> <given-names>B. P.</given-names></name> <name><surname>Zheng</surname> <given-names>Q.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Skin microbiome surveys are strongly influenced by experimental design</article-title>. <source>J. Invest. Dermatol.</source> <volume>136</volume>, <fpage>947</fpage>&#x2013;<lpage>956</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jid.2016.01.016</pub-id>, PMID: <pub-id pub-id-type="pmid">26829039</pub-id></citation></ref>
<ref id="ref71"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mirtl</surname> <given-names>M.</given-names></name> <name><surname>Borer</surname> <given-names>E. T.</given-names></name> <name><surname>Djukic</surname> <given-names>I.</given-names></name> <name><surname>Forsius</surname> <given-names>M.</given-names></name> <name><surname>Haubold</surname> <given-names>H.</given-names></name> <name><surname>Hugo</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Genesis, goals and achievements of long-term ecological research at the global scale: a critical review of ILTER and future directions</article-title>. <source>Sci. Total Environ.</source> <volume>626</volume>, <fpage>1439</fpage>&#x2013;<lpage>1462</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.scitotenv.2017.12.001</pub-id></citation></ref>
<ref id="ref72"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Muletz Wolz</surname> <given-names>C. R.</given-names></name> <name><surname>Yarwood</surname> <given-names>S. A.</given-names></name> <name><surname>Campbell Grant</surname> <given-names>E. H.</given-names></name> <name><surname>Fleischer</surname> <given-names>R. C.</given-names></name> <name><surname>Lips</surname> <given-names>K. R.</given-names></name></person-group> (<year>2018</year>). <article-title>Effects of host species and environment on the skin microbiome of plethodontid salamanders</article-title>. <source>J. Anim. Ecol.</source> <volume>87</volume>, <fpage>341</fpage>&#x2013;<lpage>353</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1365-2656.12726</pub-id>, PMID: <pub-id pub-id-type="pmid">28682480</pub-id></citation></ref>
<ref id="ref73"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Obiol</surname> <given-names>A.</given-names></name> <name><surname>Giner</surname> <given-names>C. R.</given-names></name> <name><surname>S&#x00E1;nchez</surname> <given-names>P.</given-names></name> <name><surname>Duarte</surname> <given-names>C. M.</given-names></name> <name><surname>Acinas</surname> <given-names>S. G.</given-names></name> <name><surname>Massana</surname> <given-names>R.</given-names></name></person-group> (<year>2020</year>). <article-title>A metagenomic assessment of microbial eukaryotic diversity in the global ocean</article-title>. <source>Mol. Ecol. Resour.</source> <volume>20</volume>, <fpage>718</fpage>&#x2013;<lpage>731</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1755-0998.13147</pub-id>, PMID: <pub-id pub-id-type="pmid">37578240</pub-id></citation></ref>
<ref id="ref74"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oudah</surname> <given-names>M.</given-names></name> <name><surname>Henschel</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Taxonomy-aware feature engineering for microbiome classification</article-title>. <source>BMC Bioinformatics</source> <volume>19</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s12859-018-2205-3</pub-id></citation></ref>
<ref id="ref75"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Parks</surname> <given-names>D. H.</given-names></name> <name><surname>Imelfort</surname> <given-names>M.</given-names></name> <name><surname>Skennerton</surname> <given-names>C. T.</given-names></name> <name><surname>Hugenholtz</surname> <given-names>P.</given-names></name> <name><surname>Tyson</surname> <given-names>G. W.</given-names></name></person-group> (<year>2015</year>). <article-title>CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes</article-title>. <source>Genome Res.</source> <volume>25</volume>, <fpage>1043</fpage>&#x2013;<lpage>1055</lpage>. doi: <pub-id pub-id-type="doi">10.1101/gr.186072.114</pub-id>, PMID: <pub-id pub-id-type="pmid">25977477</pub-id></citation></ref>
<ref id="ref76"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pawlowski</surname> <given-names>J.</given-names></name> <name><surname>Lejzerowicz</surname> <given-names>F.</given-names></name> <name><surname>Apotheloz-Perret-Gentil</surname> <given-names>L.</given-names></name> <name><surname>Visco</surname> <given-names>J. A.</given-names></name> <name><surname>Esling</surname> <given-names>P.</given-names></name></person-group> (<year>2016</year>). <article-title>Protist metabarcoding and environmental biomonitoring: time for change</article-title>. <source>Eur. J. Protistol.</source> <volume>55</volume>, <fpage>12</fpage>&#x2013;<lpage>25</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ejop.2016.02.003</pub-id>, PMID: <pub-id pub-id-type="pmid">27004417</pub-id></citation></ref>
<ref id="ref77"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pedregosa</surname> <given-names>F.</given-names></name> <name><surname>Varoquaux</surname> <given-names>G.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name> <name><surname>Michel</surname> <given-names>V.</given-names></name> <name><surname>Thirion</surname> <given-names>B.</given-names></name> <name><surname>Grisel</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>.</citation></ref>
<ref id="ref78"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pereira</surname> <given-names>M. B.</given-names></name> <name><surname>Wallroth</surname> <given-names>M.</given-names></name> <name><surname>Jonsson</surname> <given-names>V.</given-names></name> <name><surname>Kristiansson</surname> <given-names>E.</given-names></name></person-group> (<year>2018</year>). <article-title>Comparison of normalization methods for the analysis of metagenomic gene abundance data</article-title>. <source>BMC Genomics</source> <volume>19</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s12864-018-4637-6</pub-id></citation></ref>
<ref id="ref79"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pettorelli</surname> <given-names>N.</given-names></name> <name><surname>Graham</surname> <given-names>N. A. J.</given-names></name> <name><surname>Seddon</surname> <given-names>N.</given-names></name> <name><surname>Da Cunha</surname> <given-names>M.</given-names></name> <name><surname>Bustamante</surname> <given-names>M.</given-names></name> <name><surname>Lowton</surname> <given-names>M. J.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Time to integrate global climate change and biodiversity science-policy agendas</article-title>. <source>J. Appl. Ecol.</source> <volume>58</volume>, <fpage>2384</fpage>&#x2013;<lpage>2393</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1365-2664.13985</pub-id></citation></ref>
<ref id="ref80"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Philippot</surname> <given-names>L.</given-names></name> <name><surname>Andersson</surname> <given-names>S. G. E.</given-names></name> <name><surname>Battin</surname> <given-names>T. J.</given-names></name> <name><surname>Prosser</surname> <given-names>J. I.</given-names></name> <name><surname>Schimel</surname> <given-names>J. P.</given-names></name> <name><surname>Whitman</surname> <given-names>W. B.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>The ecological coherence of high bacterial taxonomic ranks</article-title>. <source>Nat. Rev. Microbiol.</source> <volume>8</volume>, <fpage>523</fpage>&#x2013;<lpage>529</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nrmicro2367</pub-id>, PMID: <pub-id pub-id-type="pmid">20531276</pub-id></citation></ref>
<ref id="ref81"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Piggott</surname> <given-names>J. J.</given-names></name> <name><surname>Salis</surname> <given-names>R. K.</given-names></name> <name><surname>Lear</surname> <given-names>G.</given-names></name> <name><surname>Townsend</surname> <given-names>C. R.</given-names></name> <name><surname>Matthaei</surname> <given-names>C. D.</given-names></name></person-group> (<year>2015</year>). <article-title>Climate warming and agricultural stressors interact to determine stream periphyton community composition</article-title>. <source>Glob. Chang. Biol.</source> <volume>21</volume>, <fpage>206</fpage>&#x2013;<lpage>222</lpage>. doi: <pub-id pub-id-type="doi">10.1111/gcb.12661</pub-id>, PMID: <pub-id pub-id-type="pmid">24942814</pub-id></citation></ref>
<ref id="ref82"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pinto</surname> <given-names>A. J.</given-names></name> <name><surname>Raskin</surname> <given-names>L.</given-names></name></person-group> (<year>2012</year>). <article-title>PCR biases distort bacterial and archaeal community structure in pyrosequencing datasets</article-title>. <source>PLoS One</source> <volume>7</volume>:<fpage>3093</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0043093</pub-id>, PMID: <pub-id pub-id-type="pmid">22905208</pub-id></citation></ref>
<ref id="ref83"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Quast</surname> <given-names>C.</given-names></name> <name><surname>Pruesse</surname> <given-names>E.</given-names></name> <name><surname>Yilmaz</surname> <given-names>P.</given-names></name> <name><surname>Gerken</surname> <given-names>J.</given-names></name> <name><surname>Schweer</surname> <given-names>T.</given-names></name> <name><surname>Yarza</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>The SILVA ribosomal RNA gene database project: improved data processing and web-based tools</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume>, <fpage>590</fpage>&#x2013;<lpage>596</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gks1219</pub-id></citation></ref>
<ref id="ref84"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Quince</surname> <given-names>C.</given-names></name> <name><surname>Walker</surname> <given-names>A. W.</given-names></name> <name><surname>Simpson</surname> <given-names>J. T.</given-names></name> <name><surname>Loman</surname> <given-names>N. J.</given-names></name> <name><surname>Segata</surname> <given-names>N.</given-names></name></person-group> (<year>2017</year>). <article-title>Shotgun metagenomics, from sampling to analysis</article-title>. <source>Nat. Biotechnol.</source> <volume>35</volume>, <fpage>833</fpage>&#x2013;<lpage>844</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nbt.3935</pub-id>, PMID: <pub-id pub-id-type="pmid">28898207</pub-id></citation></ref>
<ref id="ref85"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Reback</surname> <given-names>J.</given-names></name> <name><surname>McKinney</surname> <given-names>W. J.</given-names></name> <name><surname>Van Den Bossche</surname> <given-names>J.</given-names></name> <name><surname>Augspurger</surname> <given-names>T.</given-names></name> <name><surname>Cloud</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2021</year>). <italic>Pandas-dev/pandas: Pandas 1.3.5</italic>.</citation></ref>
<ref id="ref86"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rognes</surname> <given-names>T.</given-names></name> <name><surname>Flouri</surname> <given-names>T.</given-names></name> <name><surname>Nichols</surname> <given-names>B.</given-names></name> <name><surname>Quince</surname> <given-names>C.</given-names></name> <name><surname>Mah&#x00E9;</surname> <given-names>F.</given-names></name></person-group> (<year>2016</year>). <article-title>VSEARCH: a versatile open source tool for metagenomics</article-title>. <source>PeerJ</source> <volume>4</volume>, <fpage>1</fpage>&#x2013;<lpage>22</lpage>. doi: <pub-id pub-id-type="doi">10.7717/peerj.2584</pub-id></citation></ref>
<ref id="ref87"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roy</surname> <given-names>J.</given-names></name> <name><surname>Mazel</surname> <given-names>F.</given-names></name> <name><surname>Sosa-Hern&#x00E1;ndez</surname> <given-names>M. A.</given-names></name> <name><surname>Due&#x00F1;as</surname> <given-names>J. F.</given-names></name> <name><surname>Hempel</surname> <given-names>S.</given-names></name> <name><surname>Zinger</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>The relative importance of ecological drivers of arbuscular mycorrhizal fungal distribution varies with taxon phylogenetic resolution</article-title>. <source>New Phytol.</source> <volume>224</volume>, <fpage>936</fpage>&#x2013;<lpage>948</lpage>. doi: <pub-id pub-id-type="doi">10.1111/nph.16080</pub-id>, PMID: <pub-id pub-id-type="pmid">31355954</pub-id></citation></ref>
<ref id="ref88"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sagova-Mareckova</surname> <given-names>M.</given-names></name> <name><surname>Boenigk</surname> <given-names>J.</given-names></name> <name><surname>Bouchez</surname> <given-names>A.</given-names></name> <name><surname>Cermakova</surname> <given-names>K.</given-names></name> <name><surname>Chonova</surname> <given-names>T.</given-names></name> <name><surname>Cordier</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Expanding ecological assessment by integrating microorganisms into routine freshwater biomonitoring</article-title>. <source>Water Res.</source> <volume>191</volume>:<fpage>116767</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.watres.2020.116767</pub-id>, PMID: <pub-id pub-id-type="pmid">33418487</pub-id></citation></ref>
<ref id="ref89"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shah</surname> <given-names>N.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Doak</surname> <given-names>T. G.</given-names></name> <name><surname>Ye</surname> <given-names>Y.</given-names></name></person-group> (<year>2010</year>). <article-title>Comparing bacterial communities inferred from 16S rRNA gene sequencing and shotgun metagenomics</article-title>. <source>Pac. Symp. Biocomput.</source> <volume>2011</volume>, <fpage>165</fpage>&#x2013;<lpage>176</lpage>. doi: <pub-id pub-id-type="doi">10.1142/9789814335058_0018</pub-id>, PMID: <pub-id pub-id-type="pmid">21121044</pub-id></citation></ref>
<ref id="ref90"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shakya</surname> <given-names>M.</given-names></name> <name><surname>Lo</surname> <given-names>C. C.</given-names></name> <name><surname>Chain</surname> <given-names>P. S. G.</given-names></name></person-group> (<year>2019</year>). <article-title>Advances and challenges in metatranscriptomic analysis</article-title>. <source>Front. Genet.</source> <volume>10</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fgene.2019.00904</pub-id></citation></ref>
<ref id="ref91"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shakya</surname> <given-names>M.</given-names></name> <name><surname>Quince</surname> <given-names>C.</given-names></name> <name><surname>Campbell</surname> <given-names>J. H.</given-names></name> <name><surname>Yang</surname> <given-names>Z. K.</given-names></name> <name><surname>Schadt</surname> <given-names>C. W.</given-names></name> <name><surname>Podar</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>Comparative metagenomic and rRNA microbial diversity characterization using archaeal and bacterial synthetic communities</article-title>. <source>Environ. Microbiol.</source> <volume>15</volume>, <fpage>1882</fpage>&#x2013;<lpage>1899</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1462-2920.12086</pub-id>, PMID: <pub-id pub-id-type="pmid">23387867</pub-id></citation></ref>
<ref id="ref92"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Smith</surname> <given-names>M. B.</given-names></name> <name><surname>Rocha</surname> <given-names>A. M.</given-names></name> <name><surname>Smillie</surname> <given-names>C. S.</given-names></name> <name><surname>Olesen</surname> <given-names>S. W.</given-names></name> <name><surname>Paradis</surname> <given-names>C.</given-names></name> <name><surname>Wu</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Natural bacterial communities serve as quantitative geochemical biosensors</article-title>. <source>mBio</source> <volume>6</volume>:<fpage>e00326-15</fpage>. doi: <pub-id pub-id-type="doi">10.1128/mBio.00326-15</pub-id>, PMID: <pub-id pub-id-type="pmid">25968645</pub-id></citation></ref>
<ref id="ref93"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stat</surname> <given-names>M.</given-names></name> <name><surname>Huggett</surname> <given-names>M. J.</given-names></name> <name><surname>Bernasconi</surname> <given-names>R.</given-names></name> <name><surname>Dibattista</surname> <given-names>J. D.</given-names></name> <name><surname>Berry</surname> <given-names>T. E.</given-names></name> <name><surname>Newman</surname> <given-names>S. J.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Ecosystem biomonitoring with eDNA: Metabarcoding across the tree of life in a tropical marine environment</article-title>. <source>Sci. Rep.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-017-12501-5</pub-id></citation></ref>
<ref id="ref94"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tapolczai</surname> <given-names>K.</given-names></name> <name><surname>Keck</surname> <given-names>F.</given-names></name> <name><surname>Bouchez</surname> <given-names>A.</given-names></name> <name><surname>Rimet</surname> <given-names>F.</given-names></name> <name><surname>Kahlert</surname> <given-names>M.</given-names></name> <name><surname>Vasselon</surname> <given-names>V.</given-names></name></person-group> (<year>2019</year>). <article-title>Diatom DNA metabarcoding for biomonitoring: strategies to avoid major taxonomical and bioinformatical biases limiting molecular indices capacities</article-title>. <source>Front. Ecol. Evol.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fevo.2019.00409</pub-id></citation></ref>
<ref id="ref95"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tavalire</surname> <given-names>H. F.</given-names></name> <name><surname>Christie</surname> <given-names>D. M.</given-names></name> <name><surname>Leve</surname> <given-names>L. D.</given-names></name> <name><surname>Ting</surname> <given-names>N.</given-names></name> <name><surname>Cresko</surname> <given-names>W. A.</given-names></name> <name><surname>Bohannan</surname> <given-names>B. J. M.</given-names></name></person-group> (<year>2021</year>). <article-title>Shared environment and genetics shape the gut microbiome after infant adoption</article-title>. <source>mBio</source> <volume>12</volume>:<fpage>e00548-21</fpage>. doi: <pub-id pub-id-type="doi">10.1128/mBio.00548-21</pub-id>, PMID: <pub-id pub-id-type="pmid">33785620</pub-id></citation></ref>
<ref id="ref96"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tedersoo</surname> <given-names>L.</given-names></name> <name><surname>Bahram</surname> <given-names>M.</given-names></name> <name><surname>Zinger</surname> <given-names>L.</given-names></name> <name><surname>Nilsson</surname> <given-names>R. H.</given-names></name> <name><surname>Kennedy</surname> <given-names>P. G.</given-names></name> <name><surname>Yang</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Best practices in metabarcoding of fungi: from experimental design to results</article-title>. <source>Mol. Ecol.</source> <volume>31</volume>, <fpage>2769</fpage>&#x2013;<lpage>2795</lpage>. doi: <pub-id pub-id-type="doi">10.1111/mec.16460</pub-id>, PMID: <pub-id pub-id-type="pmid">35395127</pub-id></citation></ref>
<ref id="ref97"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tessler</surname> <given-names>M.</given-names></name> <name><surname>Neumann</surname> <given-names>J. S.</given-names></name> <name><surname>Afshinnekoo</surname> <given-names>E.</given-names></name> <name><surname>Pineda</surname> <given-names>M.</given-names></name> <name><surname>Hersch</surname> <given-names>R.</given-names></name> <name><surname>Velho</surname> <given-names>L. F. M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Large-scale differences in microbial biodiversity discovery between 16S amplicon and shotgun sequencing</article-title>. <source>Sci. Rep.</source> <volume>7</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-017-06665-3</pub-id></citation></ref>
<ref id="ref98"><citation citation-type="other"><person-group person-group-type="author"><collab id="coll1">The Scikit-Bio Development Team</collab></person-group>. (<year>2020</year>). <italic>Scikit-bio: A bioinformatics library for data scientists, students, and developers</italic>. Available at: <ext-link xlink:href="http://scikit-bio.org" ext-link-type="uri">http://scikit-bio.org</ext-link>.</citation></ref>
<ref id="ref99"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Turner</surname> <given-names>T. R.</given-names></name> <name><surname>Ramakrishnan</surname> <given-names>K.</given-names></name> <name><surname>Walshaw</surname> <given-names>J.</given-names></name> <name><surname>Heavens</surname> <given-names>D.</given-names></name> <name><surname>Alston</surname> <given-names>M.</given-names></name> <name><surname>Swarbreck</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>Comparative metatranscriptomics reveals kingdom level changes in the rhizosphere microbiome of plants</article-title>. <source>ISME J.</source> <volume>7</volume>, <fpage>2248</fpage>&#x2013;<lpage>2258</lpage>. doi: <pub-id pub-id-type="doi">10.1038/ismej.2013.119</pub-id>, PMID: <pub-id pub-id-type="pmid">23864127</pub-id></citation></ref>
<ref id="ref100"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Urich</surname> <given-names>T.</given-names></name> <name><surname>Lanz&#x00E9;n</surname> <given-names>A.</given-names></name> <name><surname>Qi</surname> <given-names>J.</given-names></name> <name><surname>Huson</surname> <given-names>D. H.</given-names></name> <name><surname>Schleper</surname> <given-names>C.</given-names></name> <name><surname>Schuster</surname> <given-names>S. C.</given-names></name></person-group> (<year>2008</year>). <article-title>Simultaneous assessment of soil microbial community structure and function through analysis of the meta-transcriptome</article-title>. <source>PLoS One</source> <volume>3</volume>:<fpage>e2527</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0002527</pub-id>, PMID: <pub-id pub-id-type="pmid">18575584</pub-id></citation></ref>
<ref id="ref101"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Uyaguari-Diaz</surname> <given-names>M. I.</given-names></name> <name><surname>Chan</surname> <given-names>M.</given-names></name> <name><surname>Chaban</surname> <given-names>B. L.</given-names></name> <name><surname>Croxen</surname> <given-names>M. A.</given-names></name> <name><surname>Finke</surname> <given-names>J. F.</given-names></name> <name><surname>Hill</surname> <given-names>J. E.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>A comprehensive method for amplicon-based and metagenomic characterization of viruses, bacteria, and eukaryotes in freshwater samples</article-title>. <source>Microbiome</source> <volume>4</volume>, <fpage>1</fpage>&#x2013;<lpage>19</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-016-0166-1</pub-id></citation></ref>
<ref id="ref102"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Van Rossum</surname> <given-names>G.</given-names></name> <name><surname>Drake</surname> <given-names>F. L.</given-names></name></person-group> (<year>2009</year>). <source>Python 3 reference manual</source>. <publisher-loc>Scotts Valley, CA</publisher-loc>: <publisher-name>CreateSpace</publisher-name>.</citation></ref>
<ref id="ref103"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vera-Gargallo</surname> <given-names>B.</given-names></name> <name><surname>Chowdhury</surname> <given-names>T. R.</given-names></name> <name><surname>Brown</surname> <given-names>J.</given-names></name> <name><surname>Fansler</surname> <given-names>S. J.</given-names></name> <name><surname>Dur&#x00E1;n-Viseras</surname> <given-names>A.</given-names></name> <name><surname>S&#x00E1;nchez-Porro</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Spatial distribution of prokaryotic communities in hypersaline soils</article-title>. <source>Sci. Rep.</source> <volume>9</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-018-38339-z</pub-id></citation></ref>
<ref id="ref104"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Virtanen</surname> <given-names>P.</given-names></name> <name><surname>Gommers</surname> <given-names>R.</given-names></name> <name><surname>Oliphant</surname> <given-names>T. E.</given-names></name> <name><surname>Haberland</surname> <given-names>M.</given-names></name> <name><surname>Reddy</surname> <given-names>T.</given-names></name> <name><surname>Cournapeau</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>SciPy 1.0: fundamental algorithms for scientific computing in Python</article-title>. <source>Nat. Methods</source> <volume>17</volume>, <fpage>261</fpage>&#x2013;<lpage>272</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41592-019-0686-2</pub-id>, PMID: <pub-id pub-id-type="pmid">32015543</pub-id></citation></ref>
<ref id="ref105"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Walker</surname> <given-names>A. W.</given-names></name> <name><surname>Martin</surname> <given-names>J. C.</given-names></name> <name><surname>Scott</surname> <given-names>P.</given-names></name> <name><surname>Parkhill</surname> <given-names>J.</given-names></name> <name><surname>Flint</surname> <given-names>H. J.</given-names></name> <name><surname>Scott</surname> <given-names>K. P.</given-names></name></person-group> (<year>2015</year>). <article-title>16S rRNA gene-based profiling of the human infant gut microbiota is strongly influenced by sample processing and PCR primer choice</article-title>. <source>Microbiome</source> <volume>3</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-015-0087-4</pub-id></citation></ref>
<ref id="ref106"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Weiss</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>Z. Z.</given-names></name> <name><surname>Peddada</surname> <given-names>S.</given-names></name> <name><surname>Amir</surname> <given-names>A.</given-names></name> <name><surname>Bittinger</surname> <given-names>K.</given-names></name> <name><surname>Gonzalez</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Normalization and microbial differential abundance strategies depend upon data characteristics</article-title>. <source>Microbiome</source> <volume>5</volume>:<fpage>27</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s40168-017-0237-y</pub-id></citation></ref>
<ref id="ref107"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Witten</surname> <given-names>I. H.</given-names></name> <name><surname>Frank</surname> <given-names>E.</given-names></name></person-group> (<year>2005</year>). <source>Data mining: Practical machine learning tools and techniques</source>. <edition>2nd Edn.</edition> <publisher-loc>San Francisco</publisher-loc>: <publisher-name>Elsevier Inc.</publisher-name></citation></ref>
<ref id="ref108"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wood</surname> <given-names>D. E.</given-names></name> <name><surname>Lu</surname> <given-names>J.</given-names></name> <name><surname>Langmead</surname> <given-names>B.</given-names></name></person-group> (<year>2019</year>). <article-title>Improved metagenomic analysis with Kraken 2</article-title>. <source>Genome Biol.</source> <volume>20</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s13059-019-1891-0</pub-id></citation></ref>
<ref id="ref109"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wooley</surname> <given-names>J. C.</given-names></name> <name><surname>Godzik</surname> <given-names>A.</given-names></name> <name><surname>Friedberg</surname> <given-names>I.</given-names></name></person-group> (<year>2010</year>). <article-title>A primer on metagenomics</article-title>. <source>PLoS Comput. Biol.</source> <volume>6</volume>:<fpage>e1000667</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000667</pub-id>, PMID: <pub-id pub-id-type="pmid">20195499</pub-id></citation></ref>
<ref id="ref110"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>H.</given-names></name> <name><surname>Cai</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>S.</given-names></name> <name><surname>Zou</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Metagenomics biomarkers selected for prediction of three different diseases in Chinese population</article-title>. <source>Biomed. Res. Int.</source> <volume>2018</volume>:<fpage>2936257</fpage>. doi: <pub-id pub-id-type="doi">10.1155/2018/2936257</pub-id>, PMID: <pub-id pub-id-type="pmid">29568746</pub-id></citation></ref>
<ref id="ref111"><citation citation-type="book"><person-group person-group-type="author"><collab id="coll2">WWF</collab></person-group>. (<year>2020</year>). <source>Living planet report 2020 &#x2013; bending the curve of biodiversity loss</source>. <publisher-loc>Gland, Switzerland</publisher-loc>: <publisher-name>WWF</publisher-name>.</citation></ref>
<ref id="ref112"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xue</surname> <given-names>Y.</given-names></name> <name><surname>Lanz&#x00E9;n</surname> <given-names>A.</given-names></name> <name><surname>Jonassen</surname> <given-names>I.</given-names></name></person-group> (<year>2020</year>). <article-title>Reconstructing ribosomal genes from large scale total RNA meta-transcriptomic data</article-title>. <source>Bioinformatics</source> <volume>36</volume>, <fpage>3365</fpage>&#x2013;<lpage>3371</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa177</pub-id>, PMID: <pub-id pub-id-type="pmid">32167532</pub-id></citation></ref>
<ref id="ref113"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname> <given-names>Y. W.</given-names></name> <name><surname>Jiang</surname> <given-names>Q. Y.</given-names></name> <name><surname>Wang</surname> <given-names>J. G.</given-names></name> <name><surname>Zhu</surname> <given-names>T.</given-names></name> <name><surname>Zou</surname> <given-names>B.</given-names></name> <name><surname>Qiu</surname> <given-names>Q. F.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Microbial communities and diversities in mudflat sediments analyzed using a modified metatranscriptomic method</article-title>. <source>Front. Microbiol.</source> <volume>9</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.3389/fmicb.2018.00093</pub-id></citation></ref>
<ref id="ref114"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yilmaz</surname> <given-names>P.</given-names></name> <name><surname>Kottmann</surname> <given-names>R.</given-names></name> <name><surname>Pruesse</surname> <given-names>E.</given-names></name> <name><surname>Quast</surname> <given-names>C.</given-names></name> <name><surname>Gl&#x00F6;ckner</surname> <given-names>F. O.</given-names></name></person-group> (<year>2011</year>). <article-title>Analysis of 23S rRNA genes in metagenomes - a case study from the Global Ocean sampling expedition</article-title>. <source>Syst. Appl. Microbiol.</source> <volume>34</volume>, <fpage>462</fpage>&#x2013;<lpage>469</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.syapm.2011.04.005</pub-id>, PMID: <pub-id pub-id-type="pmid">21676569</pub-id></citation></ref>
<ref id="ref115"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zizka</surname> <given-names>V. M. A.</given-names></name> <name><surname>Elbrecht</surname> <given-names>V.</given-names></name> <name><surname>Macher</surname> <given-names>J. N.</given-names></name> <name><surname>Leese</surname> <given-names>F.</given-names></name></person-group> (<year>2019</year>). <article-title>Assessing the influence of sample tagging and library preparation on DNA metabarcoding</article-title>. <source>Mol. Ecol. Resour.</source> <volume>19</volume>, <fpage>893</fpage>&#x2013;<lpage>899</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1755-0998.13018</pub-id>, PMID: <pub-id pub-id-type="pmid">30963710</pub-id></citation></ref>
</ref-list>
</back>
</article>