<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="brief-report" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">725095</article-id>
<article-id pub-id-type="doi">10.3389/fdata.2021.725095</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Brief Research Report</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>NPARS&#x2014;A Novel Approach to Address Accuracy and Reproducibility in Genomic Data Science</article-title>
<alt-title alt-title-type="left-running-head">Ma et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">NGS Post-Pipeline Reproducibility System</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Ma</surname>
<given-names>Li</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1467895/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Peterson</surname>
<given-names>Erich A.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1323984/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shin</surname>
<given-names>Ik Jae</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1460758/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Muesse</surname>
<given-names>Jason</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Marino</surname>
<given-names>Katy</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Steliga</surname>
<given-names>Matthew A.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Johann</surname>
<given-names>Donald J.</given-names>
<suffix>Jr</suffix>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1411901/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Winthrop P. Rockefeller Cancer Institute, University of Arkansas for Medical Sciences, <addr-line>Little Rock</addr-line>, <addr-line>AR</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Department of Information Science, University of Arkansas at Little Rock, <addr-line>Little Rock</addr-line>, <addr-line>AR</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/472005/overview">Huixiao Hong</ext-link>, United&#x20;States Food and Drug Administration, United&#x20;States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/553583/overview">Jung Hun Oh</ext-link>, Memorial Sloan Kettering Cancer Center, United&#x20;States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1431063/overview">Sheeba Samuel</ext-link>, Friedrich Schiller University Jena, Germany</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Donald J.&#x20;Johann Jr, <email>djjohann@uams.edu</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Medicine and Public Health, a section of the journal Frontiers in Big&#x20;Data</p>
</fn>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this&#x20;work</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>27</day>
<month>09</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>4</volume>
<elocation-id>725095</elocation-id>
<history>
<date date-type="received">
<day>14</day>
<month>06</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>09</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Ma, Peterson, Shin, Muesse, Marino, Steliga and Johann.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Ma, Peterson, Shin, Muesse, Marino, Steliga and Johann</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Background:</bold> Accuracy and reproducibility are vital in science and present a significant challenge in the emerging discipline of data science, especially when the data are scientifically complex and massive in size. Further complicating matters, in the field of genomic-based science high-throughput sequencing technologies generate considerable amounts of data that need to be stored, manipulated, and analyzed using a plethora of software tools. Researchers are rarely able to reproduce published genomic studies.</p>
<p>
<bold>Results:</bold> Presented is a novel approach which facilitates accuracy and reproducibility for large genomic research data sets. All data needed is loaded into a portable local database, which serves as an interface for well-known software frameworks. These include python-based Jupyter Notebooks and the use of RStudio projects and R markdown. All software is encapsulated using Docker containers and managed by Git, simplifying software configuration management.</p>
<p>
<bold>Conclusion:</bold> Accuracy and reproducibility in science are of paramount importance. For the biomedical sciences, advances in high-throughput technologies, molecular biology and quantitative methods are providing unprecedented insights into disease mechanisms. With these insights comes the associated challenge of scientific data that is complex and massive in size. This makes collaboration, verification, validation, and reproducibility of findings difficult. To address these challenges the NGS post-pipeline accuracy and reproducibility system (NPARS) was developed. NPARS is a robust software infrastructure and methodology that can encapsulate data, code, and reporting for large genomic studies. This paper demonstrates the successful use of NPARS on large and complex genomic data sets across different computational platforms.</p>
</abstract>
<kwd-group>
<kwd>genomics</kwd>
<kwd>data science</kwd>
<kwd>reproducibility</kwd>
<kwd>accuracy</kwd>
<kwd>analytic validity</kwd>
</kwd-group>
<contract-num rid="cn001">HHSF223201610111C</contract-num>
<contract-sponsor id="cn001">U.S. Department of Health and Human Services<named-content content-type="fundref-id">10.13039/100000016</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>The intersection of data science, analytics, and precision medicine is now having an increasingly important role in the formation and delivery of health care, especially in cancer where the treatment regimens are complex and becoming more individualized (<xref ref-type="bibr" rid="B17">Ginsburg and Phillips, 2018</xref>). The National Research Council defined precision medicine as the ability to guide health care toward the most effective treatment for a given patient, improving quality and reducing the need for unnecessary diagnostic testing and therapies (<xref ref-type="bibr" rid="B36">National Research Council, 2011</xref>). Our understanding of the genomic basis of disease (cancer) is being transformed by the combination of next generation sequencing (NGS) and state-of-the-art computational data analysis, which are empowering the entry of innovative molecular assays into the clinic, and further enabling precision medicine (<xref ref-type="bibr" rid="B6">Berger and Mardis, 2018</xref>). Precision medicine is data science driven (<xref ref-type="bibr" rid="B17">Ginsburg and Phillips, 2018</xref>).</p>
<p>
<italic>Data science</italic> is a nascent, cross-disciplinary field that can be viewed as an amalgamation of classic disciplines. These include, but are not limited to: statistics, applied mathematics, and computer science, and importantly is focused on finding non-obvious and useful patterns from large datasets (<xref ref-type="bibr" rid="B28">Kelleher and Tierney, 2018</xref>). Data science seeks to find patterns and discriminators in order to support actionable decision making (<xref ref-type="bibr" rid="B10">Cao, 2017a</xref>; <xref ref-type="bibr" rid="B24">He and Lin, 2020</xref>). How can an insight be actionable? Except for domain-specific factors, the <italic>predictive power</italic> of an insight makes itself actionable (<xref ref-type="bibr" rid="B13">Dhar, 2013</xref>). A central tenet in science that distinctly extends into data science is <italic>accuracy,</italic> which is the quality or state of being correct or precise. It is also defined as simply the ratio of correctly predicted observations to the total observations, and is utilized to measure predictive&#x20;power.</p>
<p>Data science is enabling new and different understandings and reshaping several traditional fields (e.g., microbiology and microbiome, supply chain management, astronomy) into heavily data-driven disciplines (<xref ref-type="bibr" rid="B8">Borne, 2010</xref>; <xref ref-type="bibr" rid="B23">Hazen et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B7">Bolyen et&#x20;al., 2019</xref>). The term &#x201c;<italic>Data Science</italic>&#x201d; is becoming increasingly associated with data sets massive in size, but there are additional challenges in this rapidly evolving field. Some factors considered to contribute to the challenges include: 1) <italic>data complexity</italic>, which refers to complicated data circumstances and characteristics, including the quality of data, largeness of scale, high dimensionality, and extreme imbalance; 2) the development of effective algorithms and, common task infrastructures and learning paradigms needed to handle various aspects of data; 3) the appropriate design of experiments; 4) proper translation mechanisms in order to present and visualize analytical results; 5) <italic>domain complexities</italic>, which refers to expert knowledge, hypotheses, meta-knowledge, etc., in the particular subject matter field (<xref ref-type="bibr" rid="B11">Cao, 2017b</xref>).</p>
<p>There is a known reproducibility problem in science. This was investigated and quantified by a survey conducted by the journal Nature involving over 1,500 scientists (<xref ref-type="bibr" rid="B4">Baker, 2016</xref>). The survey results reported that over 70% of researchers have tried and failed to reproduce another scientist&#x2019;s results, and more than half have failed to reproduce their own experiments. The survey also uncovered ambiguity concerning the exact definition of reproducibility, and this definition may be different depending on the scientific&#x20;field.</p>
<p>In data science, <italic>reproducibility</italic> is generally defined as the ability to re-compute data analytic results, with an observed dataset and requisite information regarding the analysis tools (<xref ref-type="bibr" rid="B40">Peng, 2015</xref>). Given reproducibility, independent researchers can build up evidence for or in contradiction to a scientific hypothesis (<xref ref-type="bibr" rid="B39">Peng, 2011</xref>; <xref ref-type="bibr" rid="B1">Aarts et&#x20;al., 2015</xref>). Some studies have suggested a large number of practical rules or methods for enhancing reproducibility in research (<xref ref-type="bibr" rid="B50">Sandve et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B49">Rupprecht et&#x20;al., 2020</xref>). Nonetheless, in several fields, non-reproducibility is still an obstacle towards the better understanding of datasets, further blocking the path to new scientific discoveries (<xref ref-type="bibr" rid="B35">Mobley et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B26">Iqbal et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B19">Goodman et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B55">Wen et&#x20;al., 2018</xref>). In addition, the current situation has forced us to face an awkward truth, that is, while our ability to generate data has grown dramatically, our ability to thoroughly understand data outputs has not developed at the same rate (<xref ref-type="bibr" rid="B40">Peng, 2015</xref>). Only if an analytical result is reproducible, can its accuracy be determined. The accuracy itself is based on evaluating the average performance of a series of analytical results from the same dataset. Only then can we say that such an analytical result is valid and has <italic>analytical validity</italic>. In other words, analytical validity tells us how good the predictive power of an insight is. Accuracy and reproducibility are cornerstones of analytical validity.</p>
<p>As more realize the implications and challenges presented by reproducibility in the field of biology, outstanding bioinformatics tools have been developed for improving the situation. To conquer the heterogeneities in bioinformatics tools, Bioconda (<xref ref-type="bibr" rid="B21">Gr&#xfc;ning et&#x20;al., 2018a</xref>) integrates more than 3,000 Conda tools. Docker based Dugong (<xref ref-type="bibr" rid="B33">Menegidio et&#x20;al., 2018</xref>) automates the installation of more than 3,500 bioinformatics tools. Pachyderm (<xref ref-type="bibr" rid="B37">Novella et&#x20;al., 2019</xref>) has been developed for managing complicated analyses including multiple stages and multiple tools. For specific studies, reproducible pipelines have been introduced: PiGx (<xref ref-type="bibr" rid="B56">Wurmus et&#x20;al., 2018</xref>) has been created for reproducible genomics analysis, whereas, QIIME 2 (<xref ref-type="bibr" rid="B7">Bolyen et&#x20;al., 2019</xref>) has been released for reproducible, interactive, scalable, and extensible microbiome data science. Finally, many researchers have utilized the web-based platform Galaxy (<xref ref-type="bibr" rid="B27">Jalili et&#x20;al., 2020</xref>) to facilitate collaborative and reproducible (<xref ref-type="bibr" rid="B20">Gr&#xfc;ning et&#x20;al., 2018b</xref>) biomedical analyses.</p>
<p>In genomic data science, to address reproducibility, improve scientific accuracy, and enhance collaboration, we present a robust software infrastructure and methodology that can encapsulate data, code, and reporting for large genomic studies. Our system is specifically focused on post-NGS pipeline (downstream) analysis, since it is at this juncture where collaborative endeavors arise focused on gleaning biological insights into studies employing one or more large and complex omics data sets. While the aforementioned tools each offer some methods for tackling the collaborative and reproducibility problems associated with pipeline software, none offer all the features and flexibility in our area of inquiry: post-pipeline (downstream) analysis collaboration and reproducibility. As an example, Galaxy is able to provide collaboration and reproducibility of downstream analyses, however, its ability to execute arbitrary code <italic>via</italic> a programming language of the researcher&#x2019;s choice&#x2014;if possible&#x2014;can be quite burdensome.</p>
<p>Our system is named NGS Post-pipeline Accuracy and Reproducibility System (NPARS) and its core technologies are graphically illustrated in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. NPARS is different from other approaches. Specifically, it is the first to focus on the challenges associated with the accuracy, reproducibility, as well as, providing a more convenient manner of collaboration with colleagues. This is achieved by the ability of NPARS to encapsulate large and complex genomic datasets into a portable database container, which may then be analyzed by well-established APIs (Python/Jupyter Notebook, R/Rmd). The infrastructure first loads all data needed for subsequent analyses into a local lightweight (<xref ref-type="bibr" rid="B51">SQLite, 2021</xref>) database. The data is then captured within the database along with salient metadata into a schema, which can then be accessed <italic>via</italic> well-known open-source application programming interfaces. These include the use of Jupyter Notebooks (Python) (<xref ref-type="bibr" rid="B29">Kluyver et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B44">Python Software Foundation, 2021</xref>), RProjects and RMarkdown (R) (<xref ref-type="bibr" rid="B3">Allaire et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B46">R-Project, 2021</xref>) with an aim to generate self-documenting source code, and results in portable formats. All software may be managed using Docker (<xref ref-type="bibr" rid="B34">Merkel, 2014</xref>) containers and Git (<xref ref-type="bibr" rid="B18">Git, 2021</xref>) (version control), simplifying configuration management.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Software technologies used for the NGS Post-pipeline Accuracy and Reproducibility System (NPARS) infrastructure creation. The six core technologies used are shown. <bold>(A)</bold> Study results from a genomics pipeline or repository are extracted and prepared for insertion into a SQLite database. <bold>(B)</bold> SQLite stores all genomic study outputs along with salient study metadata. <bold>(C)</bold> Git provides version control of the Dockerfiles (Docker image specification, i.e.,&#x20;analysis environment) and analysis source code. <bold>(D)</bold> Docker wraps the development environmental information into a container, simplifying software configuration management and, the initialization of a reproducible analysis environment. <bold>(E)</bold> RStudio, provides an integrated development environment for the R programming language and R Projects that are utilized, which provide an efficient way to organize software development activities. <bold>(F)</bold> RMarkdown generates self-documenting analytical reports into HTML files. <bold>(G)</bold> Jupyter Notebooks, are utilized as a development and visualization environment for Python-based projects and reports.</p>
</caption>
<graphic xlink:href="fdata-04-725095-g001.tif"/>
</fig>
</sec>
<sec sec-type="methods" id="s2">
<title>Methods</title>
<sec id="s2-1">
<title>Synthetic Data</title>
<p>Synthetic data was used in this study. All synthetic data was derived from actual human tumor tissue data sets (e.g., FastQ files). RNA-seq synthetic data was produced by RSEM (<xref ref-type="bibr" rid="B30">Li and Dewey, 2011</xref>). DNA-based synthetic data was produced through aggregation and averaging from a pool of human tumor samples. All FastQ files were initially created from BCL files using bcl2fastq2 v2.18.0.12 (<xref ref-type="bibr" rid="B5">bcl2fastq2 and bcl2fastq, 2021</xref>) and when needed or indicated, adapter trimming was performed during the conversion. FastQC v0.11.4 (<xref ref-type="bibr" rid="B15">FastQC, 2021</xref>) was used to assess the quality of all FastQ&#x20;files.</p>
</sec>
<sec id="s2-2">
<title>RNA Sequencing Pipeline</title>
<sec id="s2-2-1">
<title>Transcriptome Reconstruction and Gene-Level Count Quantification</title>
<p>STAR v2.5.3a (<xref ref-type="bibr" rid="B14">Dobin et&#x20;al., 2013</xref>) was used to align each sample&#x2019;s paired-end reads to the Ensembl Homo Sapiens reference genome build GRCh37.75, using STAR&#x2019;s &#x201c;2-pass&#x201d; method. Quality control and assessment of resulting BAM files was performed using QualiMap v2.2.1 (<xref ref-type="bibr" rid="B16">Garc&#xed;a-Alcalde et&#x20;al., 2012</xref>) and STAR output metrics. Picard v2.0.1 (<xref ref-type="bibr" rid="B43">Picard, 2021</xref>) was used to add read group information. The marking of duplicate reads and sorting of aligned files was also performed using Sambamba v0.6.5 (<xref ref-type="bibr" rid="B52">Tarasov et&#x20;al., 2015</xref>).</p>
<p>Each sample&#x2019;s BAM file was initially processed using StringTie v1.3.3b (<xref ref-type="bibr" rid="B42">Pertea et&#x20;al., 2015</xref>), along with Ensembl gene annotations to guide transcriptome reconstruction with novel transcript discovery enabled. Each patient&#x2019;s samples (i.e.,&#x20;study cohort) transcriptome was merged using StringTie&#x2019;s merge mode. Finally, the cohort&#x2019;s BAM files were processed using the newly created merged transcriptome. The StringTie option to output &#x201c;Ballgown-ready&#x201d; files was enabled.</p>
<p>Ballgown-ready files containing transcript coverage data were &#x201c;rolled up&#x201d; to the gene-level and the R v4.0.3 (<xref ref-type="bibr" rid="B46">R-Project, 2021</xref>) library IsoformSwitchAnalyzeR v1.13.05 (<xref ref-type="bibr" rid="B54">Vitting-Seerup and Sandelin, 2019</xref>) was used to disambiguate novel findings from StringTie output. Unnormalized count data was extracted from IsoformSwitchAnalyzeR and used for downstream analysis.</p>
</sec>
<sec id="s2-2-2">
<title>RNA Expressed Mutation Calling and Gene Fusion Detection</title>
<p>RNA variants were called using the Broad Institute&#x2019;s GATK Best Practices for RNA-seq variant calling (<xref ref-type="bibr" rid="B9">Calling Variants in RNAseq, 2021</xref>). These steps include the following: STAR was used to align reads to the Ensembl Homo Sapiens reference genome (build GRCh37.75), using the recommended &#x201c;2-pass&#x201d; approach. Duplicates were marked and the aligned reads sorted with Sambamba. Next, the tool SplitNCigarReads [GATK v3.9 (<xref ref-type="bibr" rid="B32">McKenna et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B12">DePristo et&#x20;al., 2011</xref>)] was used to split reads into exon segments, clip reads which overhang intronic regions, and assign a default MAPQ score of 60 to all reads. Variants were called using the HaplotypeCaller tool (GATK). Gene fusions were detected by passing FastQ files directly to STAR-Fusion v1.4.0 (<xref ref-type="bibr" rid="B22">Haas et&#x20;al., 2019</xref>).</p>
</sec>
</sec>
<sec id="s2-3">
<title>DNA Sequencing Pipeline</title>
<sec id="s2-3-1">
<title>Targeted Mutational Panel</title>
<p>FastQ files were submitted to the QIAGEN Data Analysis Center (<xref ref-type="bibr" rid="B45">QIAGEN, 2021</xref>) in a tumor/normal configuration and processed using the smCounter2 (<xref ref-type="bibr" rid="B57">Xu et&#x20;al., 2018</xref>) pipeline. The aforementioned pipeline generates aligned reads in BAM format and variants detected in VCF format. Quality control and assessment of resulting BAM files was performed using QualiMap.</p>
</sec>
<sec id="s2-3-2">
<title>Low-Pass Whole Genome Copy Number Variation</title>
<p>Each sample&#x2019;s FastQ paired-end files were aligned to the Ensembl Homo Sapiens reference genome (build GRCh37.75) using BWA v0.7.12 (<xref ref-type="bibr" rid="B31">Li and Durbin, 2009</xref>). Quality control and assessment of BAM files was performed with QualiMap. BAM files were post-processed to mark duplicates and sort aligned reads (Sambamba). Copy number data was computationally inferred using the R library ichorCNA v0.2.0 (<xref ref-type="bibr" rid="B2">Adalsteinsson et&#x20;al., 2017</xref>).</p>
</sec>
</sec>
<sec id="s2-4">
<title>Post-pipeline Reproducible Data Science Software Infrastructure</title>
<p>NPARS was implemented using the following software packages: Python v2.7.5/3.7.1; Jupyter Notebooks v6.3.0; IPython v7.22.0 (<xref ref-type="bibr" rid="B41">P&#xe9;rez and Granger, 2007</xref>); R v4.1.0; RStudio v1.4.1717 (<xref ref-type="bibr" rid="B48">RStudio, 2020</xref>); RMarkdown v.2.7; SQLite v3.35; Docker v20.10.3; and Git v2.26.2 (<xref ref-type="bibr" rid="B18">Git,&#x20;2021</xref>).</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>NPARS Overview and Workflow</title>
<p>
<xref ref-type="fig" rid="F2">Figure&#x20;2</xref> illustrates an overview and workflow for NPARS. First, the data associated with the study of interest is identified. This may be performed from either a central database/repository or directly from pipeline output files as shown in <bold>subfigure (A)</bold>. Next, custom Python scripts are used to perform extraction and transform operations on the pipeline outputs and associated metadata <bold>(B)</bold>. The result is to produce a set of standardized/structured output files, i.e.,&#x20;well-formatted comma-separated files <bold>(C)</bold>. A Python script <bold>(D)</bold> imports the structured output files into the local SQLite database containing a well-defined schema to hold the data. The SQLite database <bold>(E)</bold> is a lightweight and easily portable&#x20;database, and is utilized to store the study&#x2019;s data and metadata in a well-organized manner. Well known and regarded APIs (RProject and R-Markdown, Jupyter notebooks) are utilized to interface <bold>(F)</bold> to the SQLite database for analysis type activities.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>NPARS Overview and Workflow. <bold>(A)</bold> Genomic pipeline output for a particular study of interest is identified. This output can be stored in a database(s) and/or in output files. <bold>(B)</bold> A Python script extracts the identified study results and transforms them into well-defined structured output files. <bold>(C)</bold> The structured output files contain all data and metadata to be imported into the SQLite database. <bold>(D)</bold> A Python script imports the structured output files into the local SQLite database, which already has a well-defined schema to hold the data. <bold>(E)</bold> The SQLite database stores the scientific study data and metadata in a well-organized manner. <bold>(F)</bold> The only interface between the user and the data is through the particular SQLite API for that development environment. For example, R provides the RSQLite library that provides access to the data. <bold>(G)</bold> Each analysis environment is an abstraction (container) within a Docker container and the source code for it is checked into Git. Self-documenting coding technologies such as R/RMarkdown and Python/Jupyter Notebooks are used to perform the desired analyses. <bold>(H)</bold> Reports/analyses are generated that are both portable and reproducible.</p>
</caption>
<graphic xlink:href="fdata-04-725095-g002.tif"/>
</fig>
<p>Docker images are utilized to &#x201c;spin-up&#x201d; containers, which contain installations of an analysis environment <bold>(G)</bold>. For example, a Docker image containing an R/RStudio environment was created, which includes the necessary libraries (e.g., RMarkdown, DESeq2, etc.) to perform exploratory data analysis (EDA) and differential gene expression on a given study of interest. Python utilizing Jupyter Notebooks is another example analysis environment. Other analysis environments can be easily &#x201c;Dockerized&#x201d;, or encapsulate the analysis environment within a Docker image in order to offer the desired functionality. NPARS can also be run without Docker.</p>
<p>Docker image specifications are checked into a Git repository in the Dockerfile format, to allow images to be easily shared and to provide version control of the analysis environments and their dependencies. This greatly aids the ultimate goal of NPARS, which is reproducible output <bold>(H)</bold>. Version controlled analysis source code, can interface directly with a SQLite database <italic>via</italic> well-defined, open-source interfaces provided by the software framework of choice. For example, the R library RSQLite (<xref ref-type="bibr" rid="B47">RSQLite, 2021</xref>) may be used to directly query the data to be analyzed from the SQLite database. Finally, given the SQLite database along with access to the Git repository containing the Docker specification and source code, any collaborator may generate a reproducible, complete analysis environment, as well as, analysis results from self-documenting RMarkdown or Jupyter Notebooks.</p>
</sec>
<sec id="s3-2">
<title>Database Schema</title>
<p>The SQLite database utilized by the NPARS is displayed in <xref ref-type="fig" rid="F3">Figure&#x20;3</xref> and contains several groups of major tables. The entity relationship model illustrates the metadata and genomics study data within the context of the database schema. The <italic>Study Meta Data</italic> table (<bold>subfigure A</bold>) provides an essential repository of metadata, as well as means of central connection to the other database tables <italic>via</italic> a combination of primary and foreign keys. The <italic>DNA Mutations</italic> table (<bold>B</bold>) contains NGS mutational data from a targeted&#x20;panel.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Entity Relationship (ER) Model for the SQLite database utilized in NPARS. Metadata and genomics study data are shown within the context of the database schema. <bold>(A)</bold> Study metadata table (&#x201c;Study Meta Data&#x201d;), provides a central repository of metadata, and means of connection to the rest of the tables <italic>via</italic> primary and foreign keys. <bold>(B)</bold> DNA mutations table [&#x201c;DNA Mutations (Panel)&#x201d;] contains mutational data from a targeted DNA NGS panel. <bold>(C)</bold> Three tables store copy number variation (CNV) data (&#x201c;CNV Segmented&#x201d;), where each CNV segment is a range of chromosome bases of similar copy number value. Each CNV segment is associated with possibly many genes within it (&#x201c;CNV Genes&#x201d;), and with possibly many cytobands (&#x201c;CNV Band&#x201d;). <bold>(D)</bold> The four tables which hold RNA-based study data: isoform count (&#x201c;RNA Isoform Count&#x201d;), gene fusions (&#x201c;RNA Gene Fusion&#x201d;), gene count (&#x201c;RNA Gene Count&#x201d;) and, expressed mutations (&#x201c;RNA Expressed Mutations&#x201d;).</p>
</caption>
<graphic xlink:href="fdata-04-725095-g003.tif"/>
</fig>
<p>The remaining tables house six different types of genomic data results. Tables that contain the copy number variation data derived from DNA using an ultra-low-pass whole genome sequencing approach are shown in purple (<bold>C</bold>). As part of the ultra-low-pass approach, copy number data is segmented into chromosomal regions of similar copy number status (<italic>CNV Segmented</italic>) and, each segment/locus is annotated <italic>via</italic> one-to-many relationships with associated genes, (<italic>CNV Genes</italic>) and, associated cytobands (<italic>CNV Band</italic>). Genomic study results include a variety of RNA-based results, which are shown in light blue (<bold>D</bold>). These include isoform count data (<italic>RNA Isoform Count</italic>), gene fusion data (<italic>RNA Gene Fusion</italic>), gene count data (<italic>RNA Gene Count</italic>) which is essentially &#x201c;rolled up&#x201d; isoform count data and, expressed mutations data (<italic>RNA Expressed Mutations</italic>).</p>
</sec>
<sec id="s3-3">
<title>Data Analyses</title>
<p>NPARS can generate a wide variety of plots and tables for the purposes of EDA and/or other user-specific analyses, such as finding differentially expressed genes (DEGs). Here we present some examples of reproducible analysis results obtained from the samples (a total of 21 different NGS experiments yielding large and complex multi-omic datasets), which were described in the Methods section. EDA is an approach for analyzing datasets by summarizing and showing their main statistical properties through graphics or other data visualization methods (<xref ref-type="bibr" rid="B53">Tukey, 1977</xref>). <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref> displays a few examples used in NPARS for RNA-seq data. <bold>Subfigure A</bold> shows violin and box plots displaying the distribution of read counts for the replicates of three classes of samples colored blue, green and maroon. In this example each sample class contains three replicates. Next, a principal component analysis plot <bold>(B)</bold> of the samples begins to explore the data. The three tissue types used in this study are circled and color coded. The two principal components explain 72% of the variation. <bold>(C)</bold> A hierarchical clustering analysis (HCA) with heatmap of mean normalized counts, showing the top 20 most variable genes on the <italic>y</italic>-axis, and the three tissue types along with their three replicates colored and listed along the <italic>x</italic>-axis. It is known that tissue types T2 and T3 are biologically similar. Tissue type T1 is known to be biologically different from T2 and T3, and this is reflected in the dendrogram.</p>
<p>In addition to traditional EDA plots, the R library RCircos v.1.2.1 (<xref ref-type="bibr" rid="B59">Zhang et&#x20;al., 2013</xref>) was used in NPARS to visualize multiple NGS studies in a single plot (<xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>). From the outermost ring inward this figure is composed of: <bold>i.</bold> human chromosomal ideogram, <bold>ii.</bold> DNA panel mutations (tumor vs. germline), <bold>iii</bold>. RNA expressed mutations from the full transcriptome, <bold>iv</bold>. whole genome DNA copy number variations (tumor vs. germline) colored according to the legend symbols that denote amplification, normal, or deletion, <bold>v</bold>. RNA gene expression (TPM) and, <bold>vi</bold>. RNA gene fusions.</p>
<p>Differential gene expression (DGE) analysis takes normalized RNA-based read count data and performs a statistical analysis, to find quantitative changes in expression levels between different experimental groups. A DGE analysis report is generated by NPARS, and an abbreviated example output is shown in <xref ref-type="sec" rid="s10">Supplementary Tables S1, 2</xref>. This information was produced as part of a RSQLite query. The novel gene findings report (<xref ref-type="sec" rid="s10">Supplementary Table S1</xref>) is discussed. Subtable <bold>A</bold> shows columns for the following: <bold>i.</bold> predicted novel gene (ID), <bold>ii</bold>. locus, <bold>iii</bold>. gene name corresponding to the nearest annotated gene <bold>iv.</bold> log2 fold change (case over control), <bold>v.</bold> <italic>p</italic>-value, and <bold>vi</bold>. adjusted <italic>p</italic>-value. Subtable <bold>B</bold> displays: <bold>i.</bold> predicted novel gene (ID), <bold>ii.</bold> Case sample mean normalized count (<italic>via</italic> replicates), <bold>iii.</bold> Case sample standard deviation (replicates), <bold>iv.</bold> control sample mean normalized count (replicates) and, <bold>v</bold>. control sample standard deviation (replicates).</p>
<p>
<xref ref-type="sec" rid="s10">Supplementary Table S2</xref> illustrates an abbreviated example report for annotated gene findings. Subtable <bold>A</bold> shows columns for the following: <bold>i.</bold> annotated gene (ID), <bold>ii.</bold> gene symbol, <bold>iii</bold>. locus, <bold>iv.</bold> strand information, <bold>v.</bold> log2 fold change (case over control), <bold>vi.</bold> <italic>p</italic>-value and, <bold>vii</bold>. adjusted <italic>p</italic>-value. Subtable <bold>B</bold> shows columns for the following: <bold>i.</bold> annotated gene (ID), <bold>ii.</bold> Case sample mean normalized count (<italic>via</italic> replicates), <bold>iii</bold>. Case sample standard deviation (replicates), <bold>iv.</bold> control sample mean normalized count (replicates) and, <bold>v.</bold> control sample standard deviation (replicates).</p>
<p>An example of an abbreviated copy number variation (CNV) report derived from an ultra-low-pass whole genome (tumor/germline) NGS approach and processed by the ichor package, was generated by NPARS and is displayed in <xref ref-type="sec" rid="s10">Supplementary Table S3</xref>. The table is produced as part of an RSQLite query and shows columns for the following: <bold>i.</bold> gene symbol, <bold>ii.</bold> annotated gene (ID) per Ensembl, <bold>iii</bold>. Chromosome number, <bold>iv.</bold> Chromosomal segment start position, <bold>v.</bold> chromosomal segment end position, <bold>vi.</bold> median logR, where logR &#x3d; log2 (T1/Germline), <bold>vii.</bold> subclone status, i.e., whether the amplification or deletion event is part of a subclone per the ichor package, <bold>viii.</bold> copy number, <bold>ix.</bold> copy number type and, <bold>x.</bold> cytoband. This report shows a small example of salient CNV findings from a small selection of&#x20;genes.</p>
<p>A Python/Jupyter Notebook utilizing a library from scikit-learn (<xref ref-type="bibr" rid="B38">Pedregosa et&#x20;al., 2011</xref>) was used to generate the <italic>clustergram</italic> plot in <xref ref-type="sec" rid="s10">Supplementary Figure S3</xref> by NPARS. This approach is used as part of finding the optimal number of clusters for a K-Means analysis. RNA-seq data normalized across three sample types using DESeq2 were used in this example. The <italic>x</italic>-axis displays the number of clusters (k) during an iteration of k-means clustering analysis, and the <italic>y</italic>-axis displays the PCA weighted mean of the clusters. Each point (red dot) represents the center of a cluster and, the size of each point represents the amount of information contained in each cluster. The thickness of lines (blue) connecting points represent observations potentially moving between clusters. In this example per the clustergram plot the optimal number of clusters should be 2 or&#x20;3.</p>
<p>To further investigate the optimal number of clusters for K-Means, <italic>silhouette coefficient plots</italic> (<xref ref-type="bibr" rid="B60">Zhou and Gao, 2014</xref>) were performed using the Python/Jupyter Notebook code employing scikit-learn and shown in <xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>. Shown are a series of silhouette plots, which graphically evaluate a variety of k-means cluster configurations (2 through 7) along with corresponding silhouette coefficients and threshold value (dotted red vertical line). The value of a silhouette coefficient (<italic>x</italic>-axis) ranges from -1 to 1; a higher value indicates greater cohesion within the cluster and greater separation between clusters. A negative value indicates a possible improper cluster assignment and, a zero value indicates the object assignment is between clusters. The higher the coefficient value, the more separated and clearly identifiable is the particular cluster. The thickness of each cluster silhouette (<italic>y</italic>-axis, associated with the cluster label) indicates the cluster size. <bold>(A)</bold> Silhouette analysis for k-means clustering on sample data with 2 clusters. <bold>(B)</bold> Silhouette analysis for k-means clustering on sample data with 3 clusters. In this case the new cluster (cluster label 2) has a zero coefficient value meaning it is not significant. <bold>(C)</bold> Silhouette analysis for k-means clustering on sample data with 4 clusters. This plot shows cluster labels 2 and 3 are not significant. <bold>(D)</bold> Silhouette analysis for k-means clustering on sample data with 5 clusters. <bold>(E)</bold> Silhouette analysis for k-means clustering on sample data with 6 clusters. <bold>(F)</bold> Silhouette analysis for k-means clustering on sample data with 7 clusters. According to the plots, the optimal cluster number should be 2. 
The confluence of evidence from this evaluation and the previous one (clustergram) indicates that the optimal k-means cluster value may be 2. Datasets used to generate the plots are the same simulated data which were used to generate the clustergram plot (<xref ref-type="sec" rid="s10">Supplementary Figure S3</xref>). A Jupyter/Python Notebook was used to perform this analysis.</p>
<p>Based on the prior results from the <italic>clustergram</italic> and <italic>silhouette coefficient plots,</italic> k-means was run twice, once with two clusters, and then three clusters. <xref ref-type="sec" rid="s10">Supplementary Figure S5</xref> contains results obtained from the Python/Jupyter Notebook code for this analysis, with k-means and two clusters (<bold>A</bold>), and three clusters (<bold>B</bold>). The same RNA-seq data processed by DESeq2 was used. The plot shapes indicate the cluster membership labels: 0, 1, 2 and, the colors represent the tissue types, T1 (Tissue 1, blue), T2 (Tissue 2, orange), T3 (Tissue 3, green). A small red circle is used to highlight the primary difference between the two plots, namely, a new cluster is formed from T1. Analyzing plots <bold>A</bold> and <bold>B</bold>, it appears that two clusters may more efficiently group the data versus three clusters and, supports the results of the <italic>silhouette plots</italic> (<xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>) and, is also in agreement with the <italic>clustergram</italic> plot (<xref ref-type="sec" rid="s10">Supplementary Figure&#x20;S3</xref>).</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>The next evolution in oncology research and cancer care is being driven by data science (<xref ref-type="bibr" rid="B58">Yu and Kibbe, 2021</xref>). So, it is of paramount importance to address current accuracy and reproducibility issues. In the field of genomic data science, accuracy and reproducibility remain a considerable challenge due to the sheer size, complexity, and dynamic nature plus relative inventiveness of the quantitative biology approaches. The accuracy and reproducibility challenge does not just block the path to new scientific discoveries; more importantly, it may lead to a scenario where critical findings used for medical decision making are found to be incorrect (<xref ref-type="bibr" rid="B25">Huang and Gottardo, 2013</xref>). NPARS has been developed to meet the unmet need of improving accuracy and reproducibility in genomic data science. Currently, a limitation of our system is the requirement of the user to put their data into a standardized format for import into NPARS. These steps are not automated.</p>
<p>An accuracy and reproducibility test of NPARS was performed by running the R/RMarkdown and Python Jupyter Notebook code with the SQLite database on two different systems: 1) a Windows 10-based system and 2) a system utilizing the Ubuntu Linux distribution. The results demonstrated that the use of NPARS on two different systems produced identical outputs, and this is summarized in <xref ref-type="table" rid="T1">Table&#x20;1</xref>. Here, the term &#x201c;Passed&#x201d; means the observed and expected outputs were identical on the respective systems. The R/RMarkdown outputs were first compared. The RCircos graphic (<xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>), which summarizes and integrates seven genomics studies into a single graphical plot was visually inspected from the Windows and Linux systems and found to be identical. <xref ref-type="sec" rid="s10">Supplementary Tables S1A,B, 2A,B, 3</xref> were also identical. All EDA graphics from <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref> were compared by visual inspection and found to be identical. For the analyses performed by Python/Jupyter Notebook, the <italic>clustergram</italic> (<xref ref-type="sec" rid="s10">Supplementary Figure S3</xref>), <italic>silhouette coefficient</italic> plots (<xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>) and k-means graphics (<xref ref-type="sec" rid="s10">Supplementary Figure S5</xref>) were regenerated on each system, compared by close visual inspection and found to be identical.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>NPARS Accuracy and Reproducibility Testing Summary.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Analysis test</th>
<th align="center">System &#x23;1, Windows 10</th>
<th align="center">System &#x23;2, Linux/Ubuntu</th>
<th align="center">Comparative results (System &#x23;1 vs. System &#x23;2)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">RCircos, <xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">DESeq2 Novel Genes, <xref ref-type="sec" rid="s10">Supplementary Table S1</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">DESeq2 Annotated Genes, <xref ref-type="sec" rid="s10">Supplementary Table S2</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">Copy Number Analysis, <xref ref-type="sec" rid="s10">Supplementary Table S3</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">Violin Plots, <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">Box Plots, <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">PCA Plot, <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">HCA Plot, <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">Clustergram, <xref ref-type="sec" rid="s10">Supplementary Figure S3</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">Silhouette Coefficient Plots, <xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
<tr>
<td align="left">K-means Plots, <xref ref-type="sec" rid="s10">Supplementary Figure S5</xref>
</td>
<td align="left">Passed</td>
<td align="left">Passed</td>
<td align="left">Identical</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The first column, &#x201c;Analysis Test&#x201d;, lists the name of each test along with corresponding supplemental figure or table information. The columns &#x201c;System &#x23;1, Windows 10&#x201d; and &#x201c;System &#x23;2, Linux/Ubuntu&#x201d; list the results of each test run on these respective systems. The column titled &#x201c;Comparative Results (System &#x23;1 vs. System &#x23;2)&#x201d; reports the comparative results outcome. The term &#x201c;Passed&#x201d; means the observed and expected outputs were the same on the respective systems.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The innovative and evolving landscape of oncology research and cancer care is dependent on accurate, reproducible, and robust data science. High-throughput instrumentation is generating increasingly massive and complex genomic data sets, and continues to create opportunities and challenges in the dynamic&#x20;field of genomic data science. This makes collaboration, verification, validation, and reproducibility of findings difficult.&#x20;To address these challenges NPARS was developed. NPARS is the first system to focus on NGS downstream analysis accuracy, reproducibility, and enhancing collaboration, by effectively capturing large and complex genomic datasets into a portable database container and exposing it to well-established APIs. In this paper we have profiled and demonstrated NPARS, which is a robust software infrastructure and methodology that can encapsulate data, code, and reporting for large genomic studies. This study demonstrates the successful use of NPARS on large and complex genomic data sets across different computational platforms and begins to address the prevailing challenges of accuracy and reproducibility in genomic data science.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://gitlab.com/erichpeterson/npars-analysis">https://gitlab.com/erichpeterson/npars-analysis</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>DJ conceived the project. DJ, LM, EP devised the experiments. LM and EP performed the software implementation. IS performed laboratory experiments. MS, JM, KM coordinated laboratory experiments. DJ, EP, LM wrote the manuscript. All authors read and approved the manuscript.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>The authors would like to acknowledge the financial support of the United&#x20;States Department of Health and Human Services, Food and Drug Administration, contract HHSF223201610111C through the Arkansas Research Alliance.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdata.2021.725095/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdata.2021.725095/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aarts</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>J.&#x20;E.</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Attridge</surname>
<given-names>P. R.</given-names>
</name>
<name>
<surname>Attwood</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Axt</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Estimating the Reproducibility of Psychological Science</article-title>. <source>Science</source> <volume>349</volume> (<issue>6251</issue>):<fpage>aac4716</fpage>. <pub-id pub-id-type="doi">10.1126/science.aac4716</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Adalsteinsson</surname>
<given-names>V. A.</given-names>
</name>
<name>
<surname>Ha</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Choudhury</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Stover</surname>
<given-names>D. G.</given-names>
</name>
<name>
<surname>Parsons</surname>
<given-names>H. A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Scalable Whole-Exome Sequencing of Cell-free DNA Reveals High Concordance with Metastatic Tumors</article-title>. <source>Nat. Commun.</source> <volume>8</volume> (<issue>1</issue>), <fpage>1324</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-017-00965-y</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Allaire</surname>
<given-names>J.&#x20;J.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>McPherson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Luraschi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ushey</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Atkins</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <source>Rmarkdown: Dynamic Documents for R</source>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baker</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>1,500 Scientists Lift the Lid on Reproducibility</article-title>. <source>Nature</source> <volume>533</volume>, <fpage>452</fpage>&#x2013;<lpage>454</lpage>. <pub-id pub-id-type="doi">10.1038/533452a</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="web">
<collab>bcl2fastq2 and bcl2fastq</collab> (<year>2021</year>). <article-title>bcl2fastq2 and Bcl2fastq Conversion Software Downloads</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software/downloads.html">https://support.illumina.com/sequencing/sequencing_software/bcl2fastq-conversion-software/downloads.html</ext-link>
</comment> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berger</surname>
<given-names>M. F.</given-names>
</name>
<name>
<surname>Mardis</surname>
<given-names>E. R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The Emerging Clinical Relevance of Genomics in Cancer Medicine</article-title>. <source>Nat. Rev. Clin. Oncol.</source> <volume>15</volume> (<issue>6</issue>), <fpage>353</fpage>&#x2013;<lpage>365</lpage>. <pub-id pub-id-type="doi">10.1038/s41571-018-0002-6</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bolyen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Rideout</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Dillon</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Bokulich</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Abnet</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>Al-Ghalith</surname>
<given-names>G. A.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Reproducible, Interactive, Scalable and Extensible Microbiome Data Science Using QIIME 2</article-title>. <source>Nat. Biotechnol.</source> <volume>37</volume>, <fpage>852</fpage>&#x2013;<lpage>857</lpage>. <pub-id pub-id-type="doi">10.1038/s41587-019-0209-9</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Borne</surname>
<given-names>K. D.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Astroinformatics: Data-Oriented Astronomy Research and Education</article-title>. <source>Earth Sci. Inform.</source> <volume>3</volume>, <fpage>5</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1007/s12145-010-0055-2</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="web">
<collab>Calling Variants in RNAseq</collab> (<year>2021</year>). <article-title>Calling Variants in RNAseq: Methods and Workflows</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.broadinstitute.org/gatk/guide/article?id=3891">https://www.broadinstitute.org/gatk/guide/article?id&#x3d;3891</ext-link>
</comment> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Data Science: A Comprehensive Overview</article-title>. <source>ACM Comput. Surv.</source> <volume>50</volume> (<issue>3</issue>), <fpage>1</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1145/3076253</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Data Science</article-title>. <source>Commun. ACM</source> <volume>60</volume>, <fpage>59</fpage>&#x2013;<lpage>68</lpage>. <pub-id pub-id-type="doi">10.1145/3015456</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>DePristo</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Banks</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Poplin</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Garimella</surname>
<given-names>K. V.</given-names>
</name>
<name>
<surname>Maguire</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Hartl</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>A Framework for Variation Discovery and Genotyping Using Next-Generation DNA Sequencing Data</article-title>. <source>Nat. Genet.</source> <volume>43</volume> (<issue>5</issue>), <fpage>491</fpage>&#x2013;<lpage>498</lpage>. <pub-id pub-id-type="doi">10.1038/ng.806</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhar</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Data Science and Prediction</article-title>. <source>Commun. ACM</source> <volume>56</volume> (<issue>12</issue>):<fpage>64</fpage>&#x2013;<lpage>73</lpage>. <pub-id pub-id-type="doi">10.1145/2500499</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dobin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Schlesinger</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Drenkow</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zaleski</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jha</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>STAR: Ultrafast Universal RNA-Seq Aligner</article-title>. <source>Bioinformatics</source> <volume>29</volume> (<issue>1</issue>), <fpage>15</fpage>&#x2013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts635</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<collab>FastQC</collab> (<year>2021</year>). <article-title>A Quality Control Tool for High Throughput Sequence Data</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/">http://www.bioinformatics.babraham.ac.uk/projects/fastqc/</ext-link>
</comment> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garc&#xed;a-Alcalde</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Okonechnikov</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Carbonell</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cruz</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>G&#xf6;tz</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tarazona</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Qualimap: Evaluating Next-Generation Sequencing Alignment Data</article-title>. <source>Bioinformatics</source> <volume>28</volume> (<issue>20</issue>), <fpage>2678</fpage>&#x2013;<lpage>2679</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts503</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ginsburg</surname>
<given-names>G. S.</given-names>
</name>
<name>
<surname>Phillips</surname>
<given-names>K. A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Precision Medicine: From Science to Value</article-title>. <source>Health Aff.</source> <volume>37</volume> (<issue>5</issue>), <fpage>694</fpage>&#x2013;<lpage>701</lpage>. <pub-id pub-id-type="doi">10.1377/hlthaff.2017.1624</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="web">
<collab>Git</collab> (<year>2021</year>). <article-title>Git</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://git-scm.com/">https://git-scm.com/</ext-link>
</comment> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodman</surname>
<given-names>S. N.</given-names>
</name>
<name>
<surname>Fanelli</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ioannidis</surname>
<given-names>J.&#x20;P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>What Does Research Reproducibility Mean?</article-title> <source>Sci. Transl Med.</source> <volume>8</volume>, <fpage>341ps12</fpage>&#x2013;<lpage>102</lpage>. <pub-id pub-id-type="doi">10.1126/scitranslmed.aaf5027</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gr&#xfc;ning</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chilton</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>K&#xf6;ster</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dale</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Soranzo</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>van den Beek</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Practical Computational Reproducibility in the Life Sciences</article-title>. <source>Cel Syst.</source> <volume>6</volume> (<issue>6</issue>), <fpage>631</fpage>&#x2013;<lpage>635</lpage>. <pub-id pub-id-type="doi">10.1016/j.cels.2018.03.014</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gr&#xfc;ning</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dale</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Sj&#xf6;din</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chapman</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Rowe</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Bioconda: Sustainable and Comprehensive Software Distribution for the Life Sciences</article-title>. <source>Nat. Methods</source> <volume>15</volume> (<issue>7</issue>), <fpage>475</fpage>&#x2013;<lpage>476</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-018-0046-7</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haas</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Dobin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Stransky</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Pochet</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Regev</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Accuracy Assessment of Fusion Transcript Detection <italic>via</italic> Read-Mapping and De Novo Fusion Transcript Assembly-Based Methods</article-title>. <source>Genome Biol.</source> <volume>20</volume> (<issue>1</issue>), <fpage>213</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-019-1842-9</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hazen</surname>
<given-names>B. T.</given-names>
</name>
<name>
<surname>Boone</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Ezell</surname>
<given-names>J.&#x20;D.</given-names>
</name>
<name>
<surname>Jones-Farmer</surname>
<given-names>L. A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Data Quality for Data Science, Predictive Analytics, and Big Data in Supply Chain Management: An Introduction to the Problem and Suggestions for Research and Applications</article-title>. <source>Int. J.&#x20;Prod. Econ.</source> <volume>154</volume>, <fpage>72</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijpe.2014.04.018</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Challenges and Opportunities in Statistics and Data Science: Ten Research Areas</article-title>. <source>Harv. Data Sci. Rev.</source> <pub-id pub-id-type="doi">10.1162/99608f92.95388fcb</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gottardo</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Comparability and Reproducibility of Biomedical Data</article-title>. <source>Brief. Bioinform.</source> <volume>14</volume>, <fpage>391</fpage>&#x2013;<lpage>401</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbs078</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Iqbal</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Wallach</surname>
<given-names>J.&#x20;D.</given-names>
</name>
<name>
<surname>Khoury</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Schully</surname>
<given-names>S. D.</given-names>
</name>
<name>
<surname>Ioannidis</surname>
<given-names>J.&#x20;P.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Reproducible Research Practices and Transparency across the Biomedical Literature</article-title>. <source>PLoS Biol.</source> <volume>14</volume>, <fpage>e1002333</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pbio.1002333</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jalili</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Afgan</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Clements</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Blankenberg</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goecks</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>The Galaxy Platform for Accessible, Reproducible and Collaborative Biomedical Analyses: 2020 Update</article-title>. <source>Nucleic Acids Res.</source> <volume>48</volume> (<issue>W1</issue>), <fpage>W395</fpage>&#x2013;<lpage>W402</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa434</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kelleher</surname>
<given-names>J.&#x20;D.</given-names>
</name>
<name>
<surname>Tierney</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Data Science</source>. <publisher-loc>Cambridge</publisher-loc>, <publisher-name>MIT Press</publisher-name>.</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kluyver</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ragan-Kelley</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>P&#xe9;rez</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Granger</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Bussonnier</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Frederic</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kelley</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hamrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Grout</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Corlay</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Jupyter Notebooks&#x2014;A Publishing Format for Reproducible Computational Workflows</article-title>. <conf-name>Positioning and Power in Academic Publishing: Players, Agents and Agendas - Proceedings of the 20th International Conference on Electronic Publishing, ELPUB 2016</conf-name>. <publisher-loc>G&#x00f6;ttingen, Germany</publisher-loc>: <publisher-name>Electronic Publishing</publisher-name>, <fpage>87</fpage>&#x2013;<lpage>90</lpage>. </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Dewey</surname>
<given-names>C. N.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>RSEM: Accurate Transcript Quantification from RNA-Seq Data with or without a Reference Genome</article-title>. <source>BMC Bioinformatics</source> <volume>12</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-12-323</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Durbin</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Fast and Accurate Short Read Alignment with Burrows-Wheeler Transform</article-title>. <source>Bioinformatics</source> <volume>25</volume> (<issue>14</issue>), <fpage>1754</fpage>&#x2013;<lpage>1760</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btp324</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McKenna</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hanna</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Banks</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sivachenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cibulskis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kernytsky</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>The Genome Analysis Toolkit: a MapReduce Framework for Analyzing Next-Generation DNA Sequencing Data</article-title>. <source>Genome Res.</source> <volume>20</volume> (<issue>9</issue>), <fpage>1297</fpage>&#x2013;<lpage>1303</lpage>. <pub-id pub-id-type="doi">10.1101/gr.107524.110</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Menegidio</surname>
<given-names>F. B.</given-names>
</name>
<name>
<surname>Jabes</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Costa de Oliveira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Nunes</surname>
<given-names>L. R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Dugong: a Docker Image, Based on Ubuntu Linux, Focused on Reproducibility and Replicability for Bioinformatics Analyses</article-title>. <source>Bioinformatics</source> <volume>34</volume> (<issue>3</issue>), <fpage>514</fpage>&#x2013;<lpage>515</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btx554</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Merkel</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Docker: Lightweight Linux Containers for Consistent Development and Deployment</article-title>. <source>Linux J.</source> <volume>2014</volume>, <fpage>2</fpage>&#x2013;<lpage>7</lpage>. </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mobley</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Linder</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Braeuer</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Zwelling</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>A Survey on Data Reproducibility in Cancer Research Provides Insights into Our Limited Ability to Translate Findings from the Laboratory to the Clinic</article-title>. <source>PLoS ONE</source> <volume>8</volume>, <fpage>e63221</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0063221</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<collab>National Research Council</collab> (<year>2011</year>). <source>Toward Precision Medicine: Building a Knowledge Network for Biomedical Research and a New Taxonomy of Disease</source>. <publisher-loc>Washington (DC)</publisher-loc>, <publisher-name>National Academies Press (US)</publisher-name>.</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Novella</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Emami Khoonsari</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Herman</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Whitenack</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Capuccini</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Burman</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Container-based Bioinformatics with Pachyderm</article-title>. <source>Bioinformatics</source> <volume>35</volume> (<issue>5</issue>), <fpage>839</fpage>&#x2013;<lpage>846</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty699</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>J.&#x20;Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>R. D.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Reproducible Research in Computational Science</article-title>. <source>Science</source> <volume>334</volume>, <fpage>1226</fpage>&#x2013;<lpage>1227</lpage>. <pub-id pub-id-type="doi">10.1126/science.1213847</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>The Reproducibility Crisis in Science: A Statistical Counterattack</article-title>. <source>Significance</source> <volume>12</volume>, <fpage>30</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1111/j.1740-9713.2015.00827.x</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>P&#xe9;rez</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Granger</surname>
<given-names>B. E.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>IPython: a System for Interactive Scientific Computing</article-title>. <source>Comput. Sci. Eng.</source> <volume>9</volume> (<issue>3</issue>), <fpage>21</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1109/mcse.2007.53</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pertea</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pertea</surname>
<given-names>G. M.</given-names>
</name>
<name>
<surname>Antonescu</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>T.-C.</given-names>
</name>
<name>
<surname>Mendell</surname>
<given-names>J.&#x20;T.</given-names>
</name>
<name>
<surname>Salzberg</surname>
<given-names>S. L.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>StringTie Enables Improved Reconstruction of a Transcriptome from RNA-Seq Reads</article-title>. <source>Nat. Biotechnol.</source> <volume>33</volume> (<issue>3</issue>), <fpage>290</fpage>&#x2013;<lpage>295</lpage>. <pub-id pub-id-type="doi">10.1038/nbt.3122</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="web">
<collab>Picard</collab> (<year>2021</year>). <article-title>Picard</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://broadinstitute.github.io/picard/">http://broadinstitute.github.io/picard/</ext-link>
</comment> </citation>
</ref>
<ref id="B44">
<citation citation-type="web">
<collab>Python Software Foundation</collab> (<year>2021</year>). <article-title>Python Software Foundation</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.python.org">http://www.python.org</ext-link>
</comment> </citation>
</ref>
<ref id="B45">
<citation citation-type="book">
<collab>QIAGEN</collab> (<year>2021</year>). <source>QIAGEN Data Analysis Center</source>.</citation>
</ref>
<ref id="B46">
<citation citation-type="web">
<collab>R-Project</collab> (<year>2021</year>). <article-title>R: A Language and Environment for Statistical Computing</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.r-project.org/">http://www.r-project.org/</ext-link>
</comment>. </citation>
</ref>
<ref id="B47">
<citation citation-type="web">
<collab>RSQLite</collab> (<year>2021</year>). <article-title>&#x27;SQLite&#x27; Interface for R</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://cran.r-project.org/web/packages/RSQLite/index.html">https://cran.r-project.org/web/packages/RSQLite/index.html</ext-link>
</comment>. </citation>
</ref>
<ref id="B48">
<citation citation-type="book">
<collab>RStudio</collab> (<year>2020</year>). &#x201c;<article-title>Integrated Development for R</article-title>,&#x201d; in <source>RStudio, PBC</source>. (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>RStudio Team</publisher-name>). </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rupprecht</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>J.&#x20;C.</given-names>
</name>
<name>
<surname>Arnold</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gur</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bhagwat</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Improving Reproducibility of Data Science Pipelines through Transparent Provenance Capture</article-title>. <source>Proc. VLDB Endow.</source> <volume>13</volume>, <fpage>3354</fpage>&#x2013;<lpage>3368</lpage>. <pub-id pub-id-type="doi">10.14778/3415478.3415556</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sandve</surname>
<given-names>G. K.</given-names>
</name>
<name>
<surname>Nekrutenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hovig</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Ten Simple Rules for Reproducible Computational Research</article-title>. <source>PLoS Comput. Biol.</source> <volume>9</volume>, <fpage>e1003285</fpage>&#x2013;<lpage>4</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003285</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="web">
<collab>SQLite</collab> (<year>2021</year>). <article-title>SQLite</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.sqlite.org/index.html">https://www.sqlite.org/index.html</ext-link>
</comment> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tarasov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Vilella</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Cuppen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Nijman</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Prins</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Sambamba: Fast Processing of NGS Alignment Formats</article-title>. <source>Bioinformatics</source> <volume>31</volume> (<issue>12</issue>), <fpage>2032</fpage>&#x2013;<lpage>2034</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btv098</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tukey</surname>
<given-names>J.&#x20;W.</given-names>
</name>
</person-group> (<year>1977</year>). <source>Exploratory Data Analysis, Vol. 2</source>. <publisher-loc>Reading, MA</publisher-loc>: <publisher-name>Addison-Wesley</publisher-name>.</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vitting-Seerup</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sandelin</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>IsoformSwitchAnalyzeR: Analysis of Changes in Genome-wide Patterns of Alternative Splicing and its Functional Consequences</article-title>. <source>Bioinformatics</source> <volume>35</volume> (<issue>21</issue>), <fpage>4469</fpage>&#x2013;<lpage>4471</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz247</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.-Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.-I.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>On the Low Reproducibility of Cancer Studies</article-title>. <source>Natl. Sci. Rev.</source> <volume>5</volume>, <fpage>619</fpage>&#x2013;<lpage>624</lpage>. <pub-id pub-id-type="doi">10.1093/nsr/nwy021</pub-id> </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wurmus</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Uyar</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Osberg</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Franke</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gosdschan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wreczycka</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>PiGx: Reproducible Genomics Analysis Pipelines with GNU Guix</article-title>. <source>Gigascience</source> <volume>7</volume> (<issue>12</issue>). <pub-id pub-id-type="doi">10.1093/gigascience/giy123</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Padmanabhan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>DiCarlo</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>smCounter2: an Accurate Low-Frequency Variant Caller for Targeted Sequencing Data with Unique Molecular Identifiers</article-title>. <source>Bioinformatics</source> <volume>35</volume> (<issue>8</issue>), <fpage>1299</fpage>&#x2013;<lpage>1309</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty790</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kibbe</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Cancer Data Science and Computational Medicine</article-title>. <source>JCO Clin. Cancer Inform.</source> <volume>5</volume>, <fpage>487</fpage>&#x2013;<lpage>489</lpage>. <pub-id pub-id-type="doi">10.1200/cci.21.00006</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Meltzer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>RCircos: an R Package for Circos 2D Track Plots</article-title>. <source>BMC Bioinformatics</source> <volume>14</volume>, <fpage>244</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-14-244</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>J.&#x20;T.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Automatic Method for Determining Cluster Number Based on Silhouette Coefficient</article-title>. <source>Adv. Mater. Res.</source> <volume>951</volume>, <fpage>227</fpage>&#x2013;<lpage>230</lpage>. </citation>
</ref>
</ref-list>
</back>
</article>