<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2021.712886</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Prediction of Minimal Inhibitory Concentration of Meropenem Against <italic>Klebsiella pneumoniae</italic> Using Metagenomic Data</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Tan</surname> <given-names>Rundong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1342133/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yu</surname> <given-names>Anqi</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1422257/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname> <given-names>Ziming</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Liu</surname> <given-names>Ziqi</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Jiang</surname> <given-names>Rongfeng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Xiaoli</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Liu</surname> <given-names>Jialin</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Gao</surname> <given-names>Junhui</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1203935/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Xinjun</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Shanghai Biotecan Pharmaceuticals Co., Ltd.</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Shanghai Zhangjiang Institute of Medical Innovation</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Medical Information Engineering, Department of Medical Information, Harbin Medical University</institution>, <addr-line>Harbin</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Biostatistics, School of Global Public Health, New York University</institution>, <addr-line>New York, NY</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Critical Care Medicine, Ruijin Hospital, School of Medicine, Shanghai Jiao Tong University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>Translational Medical Center for Stem Cell Therapy, Shanghai East Hospital, School of Medicine, Tongji University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Liang Wang, Institut Pasteur of Shanghai, Chinese Academy of Sciences (CAS), China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Xin Liu, Xuzhou Medical University, China; Weidong Qian, Shaanxi University of Science and Technology, China</p></fn>
<corresp id="c001">&#x002A;Correspondence: Jialin Liu, <email>liujialin77@163.com</email></corresp>
<corresp id="c002">Junhui Gao, <email>Jhgao68@163.com</email></corresp>
<fn fn-type="other" id="fn002"><p><sup>&#x2020;</sup>These authors have contributed equally to this work and share first authorship</p></fn>
<fn fn-type="other" id="fn004"><p>This article was submitted to Systems Microbiology, a section of the journal Frontiers in Microbiology</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>08</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>712886</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>05</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>07</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2021 Tan, Yu, Liu, Liu, Jiang, Wang, Liu, Gao and Wang.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Tan, Yu, Liu, Liu, Jiang, Wang, Liu, Gao and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Minimal inhibitory concentration (MIC) is defined as the lowest concentration of an antimicrobial agent that can inhibit the visible growth of a particular microorganism after overnight incubation. Clinically, antibiotic doses for specific infections are determined according to the fraction of MIC. Therefore, credible assessment of MICs will provide a physician valuable information on the choice of therapeutic strategy. Early and precise usage of antibiotics is the key to an infection therapy. Compared with the traditional culture-based method, the approach of whole genome sequencing to identify MICs can shorten the experimental time, thereby improving clinical efficacy. <italic>Klebsiella pneumoniae</italic> is one of the most significant members of the genus <italic>Klebsiella</italic> in the Enterobacteriaceae family and also a common nosocomial pathogen. Meropenem is a broad-spectrum antibacterial agent of the carbapenem family, which can produce antibacterial effects against most Gram-positive and -negative bacteria. In this study, we used single-nucleotide polymorphism (SNP) information and nucleotide <italic>k</italic>-mers count based on metagenomic data to predict MICs of meropenem against <italic>K. pneumoniae</italic>. Then, features of 110 sequenced <italic>K. pneumoniae</italic> genome data were combined and modeled with XGBoost algorithm and deep neural network (DNN) algorithm to predict MICs. We first use the XGBoost classification model and the XGBoost regression model. After five runs, the average accuracy of the test set was calculated. The accuracy of using nucleotide <italic>k</italic>-mers to predict MICs of the XGBoost classification model and XGBoost regression model was 84.5 and 89.1%. The accuracy of SNP in predicting MIC was 80 and 81.8%, respectively. The results show that XGBoost regression is better than XGBoost classification in both nucleotide <italic>k</italic>-mers and SNPs to predict MICs. 
We further selected 40 nucleotide <italic>k</italic>-mers and 40 SNPs with the highest correlation with MIC values as features to retrain the XGBoost regression model and DNN regression model. After 100 and 1,000 runs, the results show that the accuracy of the two models was improved. The accuracy of the XGBoost regression model for <italic>k</italic>-mers, SNPs, and <italic>k</italic>-mers &#x0026; SNPs was 91.1, 85.2, and 91.3%, respectively. The accuracy of the DNN regression model was 91.9, 87.1, and 91.8%, respectively. Through external verification, some of the selected features were found to be related to drug resistance.</p>
</abstract>
<kwd-group>
<kwd><italic>Klebsiella pneumoniae</italic></kwd>
<kwd>minimum inhibitory concentration</kwd>
<kwd>meropenem</kwd>
<kwd>XGBoost</kwd>
<kwd>deep neural network</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="3"/>
<equation-count count="0"/>
<ref-count count="26"/>
<page-count count="10"/>
<word-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec id="S1">
<title>Introduction</title>
<p><italic>Klebsiella pneumoniae</italic> is a member of the enterobacterial genus <italic>Klebsiella</italic>; it is a Gram-negative bacterium that causes one-third of all Gram-negative infections (<xref ref-type="bibr" rid="B14">Navon-Venezia et al., 2017</xref>). Over the past two decades, <italic>K. pneumoniae</italic> has undergone complex evolution, with the emergence of many high-risk, highly infectious sequence types, resulting in the sustained global spread of <italic>K. pneumoniae</italic> (<xref ref-type="bibr" rid="B14">Navon-Venezia et al., 2017</xref>). In addition to widespread transmission, the increase in drug resistance in <italic>K. pneumoniae</italic> is also an important issue. Many studies and reports indicate that antimicrobial resistance (AMR) strains of <italic>K. pneumoniae</italic> have increased at an alarming rate in recent years (<xref ref-type="bibr" rid="B12">Long et al., 2017</xref>; <xref ref-type="bibr" rid="B14">Navon-Venezia et al., 2017</xref>).</p>
<p>Carbapenem antibiotics play an important role in the treatment of severe infections of drug-resistant Enterobacteriaceae, and the increase of drug resistance of <italic>K. pneumoniae</italic> and the emergence and spread of drug-resistant strains pose a serious threat to public health (<xref ref-type="bibr" rid="B22">Spagnolo et al., 2014</xref>). In fact, carbapenem antibiotic resistance in <italic>K. pneumoniae</italic> has emerged many years ago and has spread widely around the world (<xref ref-type="bibr" rid="B22">Spagnolo et al., 2014</xref>). Recent studies have shown that the resistance rates of <italic>K. pneumoniae</italic> to aztreonam, ceftazidime, ciprofloxacin, cefotaxime, cefepime and imipenem are more than 50% (<xref ref-type="bibr" rid="B6">Effah et al., 2020</xref>). Meropenem has good <italic>in vitro</italic> anti-<italic>K. pneumoniae</italic> properties and is likely to have optimal bactericidal efficacy for the treatment of <italic>K. pneumoniae</italic> (<xref ref-type="bibr" rid="B1">Baldwin et al., 2008</xref>).</p>
<p>Meropenem belongs to the carbapenem class of antibiotics and is one of the widely used antibiotics for the treatment of <italic>K. pneumoniae</italic> infections, with broad-spectrum <italic>in vitro</italic> resistance to both Gram-positive and Gram-negative pathogens (<xref ref-type="bibr" rid="B14">Navon-Venezia et al., 2017</xref>). It readily penetrates the cell walls of most Gram-negative and -positive bacteria to reach its target penicillin-binding protein (PBPS) and exhibits stability to hydrolysis by most &#x03B2;-lactamases, including penicillinases and cephalosporinases produced by Gram-positive and Gram-negative bacteria (<xref ref-type="bibr" rid="B14">Navon-Venezia et al., 2017</xref>).</p>
<p>In addition to the selection of antimicrobial agents, the timing and dosage of effective antimicrobial agents are also very important. In general, treatment is most effective when effective antibiotics are administered early. In a study of patients with infectious shock, there was a strong relationship between time to effective antimicrobial drug onset and in-hospital mortality (corrected ratio 1.119 per hour delay) (<xref ref-type="bibr" rid="B19">Pesesky et al., 2016</xref>). Neither too high nor too low a dose of antibiotics is the optimal treatment regimen: too high may result in increased resistance to <italic>K. pneumoniae</italic>, and too low will not achieve the desired effect of treatment with antibiotics. The minimum inhibitory concentration (MIC) indicates the appropriate dosage of antibiotics. MIC is an important index to measure both the effectiveness of antimicrobial agents and bacterial resistance to drugs.</p>
<p>Treatment with the optimal dose of effective antibiotics as soon as possible after the infection is the key to curing <italic>K. pneumoniae</italic> infection. Therefore, the time required to determine the MIC is an important factor to determine whether antibiotics can be used in the early stage of infection. There are many traditional methods of MIC determination, such as spatial gas chromatography methods for antimicrobial screening, electronic testing methods, and traditional petri dish measurement methods. However, traditional methods often take 18 to 24 h or even more. In order to meet the demand for antibiotic therapy, we need to find newer, faster, and more accurate techniques for detecting the MIC of antibiotics.</p>
<p>In recent years, many researchers used machine learning methods to build models that can predict MIC value more quickly and accurately (<xref ref-type="bibr" rid="B9">Li et al., 2016</xref>, <xref ref-type="bibr" rid="B10">2017</xref>; <xref ref-type="bibr" rid="B7">Eyre et al., 2017</xref>; <xref ref-type="bibr" rid="B15">Nguyen et al., 2018</xref>; <xref ref-type="bibr" rid="B18">Pataki et al., 2020</xref>). These papers presented the methods and models that were used to predict the MICs of <italic>K. pneumoniae</italic> (<xref ref-type="bibr" rid="B15">Nguyen et al., 2018</xref>), antibiotic MICs of <italic>Neisseria gonorrhoeae</italic> (<xref ref-type="bibr" rid="B7">Eyre et al., 2017</xref>), <italic>Streptococcus pneumoniae</italic> (<xref ref-type="bibr" rid="B9">Li et al., 2016</xref>), non-typhoid <italic>Salmonella</italic> (<xref ref-type="bibr" rid="B16">Nguyen et al., 2019</xref>), and <italic>Escherichia coli</italic> (<xref ref-type="bibr" rid="B18">Pataki et al., 2020</xref>).</p>
<p>A previous study has built XGBoost machine learning models to predict MICs for a comprehensive population-based collection of clinical isolates of <italic>K. pneumoniae</italic>, which was able to rapidly predict MICs for 20 antibiotics with an average accuracy of 92% (<xref ref-type="bibr" rid="B15">Nguyen et al., 2018</xref>). According to this, our study is dedicated to constructing models that can predict MICs for Meropenem treatment of <italic>K. pneumoniae</italic> more accurately and analyzing features that are highly correlated with MIC prediction and externally validating these features.</p>
<p>In this study, we first obtained single-nucleotide polymorphism (SNP) information and nucleotide <italic>k</italic>-mers (<italic>k</italic> = 6, 8, 10) counting information based on metagenomic data of <italic>K. pneumoniae</italic> sequence analysis and then trained the dataset with three machine learning and deep learning methods &#x2013; XGBoost classification method, XGBoost regression method, and deep neural network (DNN) regression method &#x2013; and finally compared the prediction results of the three methods and selected the features that are highly related to MIC to construct a new prediction model to achieve higher prediction accuracy.</p>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="S2.SS1">
<title>Data Collection</title>
<p>Two types of data were included in our study: <italic>K. pneumoniae</italic> metagenomic sequences, and the related MIC values of the antibiotic meropenem. The metagenomic data were pre-processed as tables of <italic>k</italic>-mers and SNPs for further model construction and prediction. Sequenced <italic>K. pneumoniae</italic> genome data used in this study can be downloaded <italic>via</italic> BioProject with access numbers <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA376414">PRJNA376414</ext-link>, <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA386693">PRJNA386693</ext-link>, and <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA396774">PRJNA396774</ext-link>. We collected data related to the antibiotic meropenem with complete sequence information and correct scaffold assembly, and finally, 110 genomes were involved in the study. The SRA access number for each genome is shown in the supplementary table.</p>
<p>HS11286<sup><xref ref-type="fn" rid="footnote1">1</xref></sup> was selected to be our reference genome for SNP calling. The table file with SRA ID and MIC values for meropenem was downloaded from the supplementary materials attached from <xref ref-type="bibr" rid="B15">Nguyen et al. (2018)</xref>.</p>
<p>For sequence data, the fastq-dump tool SRA Toolkit was used (with -I &#x2013;split-files parameters). SPAdes (<xref ref-type="bibr" rid="B2">Bankevich et al., 2012</xref>) was then used (with &#x2212;1, &#x2212;2 and -o parameters) to assemble the paired-end sequences for each sample. Finally, the assembled scaffold.fasta files were mapped to the reference genome to obtain <italic>k</italic>-mers and SNP information.</p>
</sec>
<sec id="S2.SS2">
<title>Data Pre-processing</title>
<sec id="S2.SS2.SSS1">
<title>Nucleotide <italic>k</italic>-mers</title>
<p>In the study, 110 assembled genome scaffold files were processed to produce matrices of <italic>k</italic>-mers features. For each genome, we cut the scaffold sequences starting from the first nucleotide with 6-, 8-, and 10-nucleotide window lengths, respectively. For the following cuts, starting points of the windows move forward with one nucleotide each time until the sequence ends. Finally, a matrix with 110 rows and 559,494 columns of 6, 8, and 10 length nucleotide fragments were created for model training.</p>
</sec>
<sec id="S2.SS2.SSS2">
<title>Calling SNPs</title>
<p>According to studies by <xref ref-type="bibr" rid="B24">Yang et al. (2018</xref>, <xref ref-type="bibr" rid="B25">2019)</xref>, SNPs associated with drug resistance in <italic>Mycobacterium tuberculosis</italic> were used as features for prediction.</p>
<p>We extracted SNPs from the whole gene to find the resistant SNPs. For SNP calling, the raw 110 <italic>K. pneumoniae</italic> metagenomic samples were mapped to the HS11286 (&#x201C;see text footnote 1&#x201D;) reference genome with single end reads mode, and then reads of the 110 genome samples were mapped to the reference genome using samtools v1.9 (<xref ref-type="bibr" rid="B3">Bonfield et al., 2021</xref>), resulting in 110 .vcf files. Further filtering was conducted using bcftools v1.10 (<xref ref-type="bibr" rid="B8">Li, 2011</xref>) (with parameters %QUAL &#x2265; 50 &#x0026; DP &#x2265; 20). Finally, a combined matrix of the combined SNPs with 110 rows and 164,138 columns was obtained. The columns of the matrix represent the concatenation of the SNP positions compared to the reference genome, where a sample with a mutation at that position was marked as 1 and those without mutations were marked as 0.</p>
</sec>
</sec>
<sec id="S2.SS3">
<title>EXtreme Gradient Boosting (XGBoost) Model Development</title>
<sec id="S2.SS3.SSS1">
<title>XGBoost</title>
<p>EXtreme Gradient Boosting (XGBoost) algorithm is an optimized distributed implementation of gradient boosted decision trees, designed for computational speed and higher performance. Since its initial release in 2014 (<xref ref-type="bibr" rid="B4">Chen and Guestrin, 2016</xref>), in the past few years, XGBoost has been applied to a number of biomedical problems.</p>
<p>As a machine learning algorithm implemented under the gradient boosting framework, the starting point of XGBoost is decision trees. However, here, each tree is fitted to the residuals (prediction errors) of the previous tree in order to gradually minimize the deviations between the model and the observed target data. This is done by giving more weight to the poorly modeled cases. In contrast to the Random Forest model, the trees are thus not independent of each other. Besides the different random samples, this is additionally achieved by the fact that not all predictors are available for selection at each branching, but only a randomly chosen subset, achieving exceptionally high performance for regression as well as classification tasks. Classification trees are used to identify the class/category within which the input variables would most likely fall, while regression algorithms are suitable for continuous variables, and the tree is used to predict the value.</p>
<p>XGBoost algorithm has gradient boosting at its core. However, unlike simple gradient boosting algorithms, the XGboost model takes a parallelization approach in the process of sequential addition of the weak learners, whereby proper utilization of the CPU core of the machine is utilized, leading to greater speed and performance (<xref ref-type="bibr" rid="B21">Santhanam, 2016</xref>). Moreover, it is a distributed and scalable computing method that is available for large datasets.</p>
<p>Moreover, one benefit of the gradient boosting model is that for different loss functions, new algorithms are not required to be derived; it is enough that a suitable loss function be chosen and then incorporated with the gradient boosting framework.</p>
</sec>
<sec id="S2.SS3.SSS2">
<title>Model Training</title>
<p>We used XGBoost to train both classification and regression models, respectively; several predict models were built depending on data type.</p>
<p>For <italic>k</italic>-mers data, the occurrence times of each <italic>k</italic>-mer in each sample were counted, and we used all possible segments as features and mapped the number of <italic>k</italic>-mers to [0, 1] with Min&#x2013;Max normalization. For SNPs data, features were characterized by binary number as zeros and ones of all mutation sites. The data were divided into training and test set as 8:2.</p>
<p>Our XGBoost models were set as tree-based structure (with <bold><italic>booster</italic> = <italic>&#x201C;gbtree&#x201D;</italic></bold>), and GridsearchCV was applied for hyperparameter tuning. In order to prevent the XGBoost training process from generating too many trees, which causes the machine learning model to eventually overfit, we use fivefold cross-validation to select the most appropriate number of iterations; the value of booster_round is used as the num of XGBoost booster_round parameter, which is brought into the model training. Also, considering that our dataset is on the small side, using cross-validation also allows training with as much data as possible.</p>
<p>We first trained the XGboost multi-classification model, with the objective parameter <bold><italic>Multi: Softmax</italic></bold>. Input samples are fed into the generated XGBoost tree, and the leaf to which the sample belongs is found in each tree; the belonging weight is then added to obtain the predictions. As it is a multiclass classification model, we set 17 categories as classification labels to train the model, with a minimum MIC value of 0 and a maximum MIC value of 16, equally divided into 17 intervals. The prediction results are obtained by the <bold><italic>softmax</italic></bold> function, as probabilities of belonging to a certain MIC interval. For the regression model, the objective parameter of XGboost is <bold><italic>Reg: Gamma</italic></bold>, as MIC values can be regarded as gamma-distributed. The MICs of each sample were used as label of model training.</p>
<p>To prevent the XGBoost training process from generating too many trees and causing the machine learning model to be overfitted, we use fivefold cross-validation to find the most appropriate number of iterations (<bold><italic>num _booster_round</italic> = <italic>&#x201C;2000&#x201D;</italic></bold>) to the model training. In addition, using cross-validation also allows us to use as much data as possible for training, considering our small dataset. Also, the maximum depth of the tree, <bold><italic>max_depth</italic></bold>, was set to 6, and the proportion of random sampling, <bold><italic>subsample</italic></bold>, is 0.6.</p>
<p>The accuracy of the model was determined by the absolute value of the difference between the log2-transform of the predicted values and the true values.</p>
</sec>
</sec>
<sec id="S2.SS4">
<title>DNN Model Development</title>
<sec id="S2.SS4.SSS1">
<title>DNN</title>
<p>Deep learning is a concept for an approach to artificial intelligence called neural networks, and the DNN model is a basic deep learning framework. As a particular class of artificial neural networks with fully connected architecture, between the input and the output layer, there is an arbitrary number of hidden layers (<xref ref-type="bibr" rid="B26">Zador, 2019</xref>).</p>
<p>In principle, neural networks usually consist of four components: The input layer, the hidden layer(s), the output layer, and edges that connect the individual layers. More precisely, the edges connect individual nodes within the layers, whereby each transfer functions as a kind of container for a numerical value. The edges between the nodes have weights that define how the input is calculated across the edge to the next node. The arrangement of these components depends on the type and purpose of the network. Thus, the main difference between DNN and classical machine learning methods is the ability to process unstructured data through artificial neural networks (<xref ref-type="bibr" rid="B5">Dargan et al., 2019</xref>).</p>
</sec>
<sec id="S2.SS4.SSS2">
<title>Model Training</title>
<p>To further improve the performance of MIC prediction, we assessed the importance of <italic>k</italic>-mers and SNPs, respectively, based on the previous XGBoost model. We ranked all <italic>k</italic>-mers and SNP features using f-score as standard, and we found that the f-score values of <italic>k</italic>-mers and SNP features that were ranked in top 40 were greater than 1, while the others were not that significant. Thus, for the DNN method, the top 40 most important <italic>k</italic>-mers and SNPs were selected as features for the deep learning-based modeling. We established the following three models to predict MIC value: <italic>k</italic>-mers model, SNPs model, and <italic>k</italic>-mers &#x0026; SNPs model. Our overall work flow of MIC prediction modeling is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Work flow of MIC prediction modeling.</p></caption>
<graphic xlink:href="fmicb-12-712886-g001.tif"/>
</fig>
<p>The DNN model with <italic>k</italic>-mers and SNP inputs uses a <bold><italic>Dense</italic></bold> neural network framework, where the top 40 most important features for predicting MIC values are fed into a 128-unit Dense layer with a <bold><italic>relu</italic></bold> activation function to train the DNN model. Similarly, on the test set, the absolute value of the difference between the log2 transform of the predicted value and the true value is used as the basis for assessing the accuracy of the model.</p>
<p>In particular, for the <italic>k</italic>-mers &#x0026; SNPs input, we use a combined Dense + LSTM model frame. More specifically, for the top 40 characteristic <italic>k</italic>-mers data selected by the previous model, input the Dense layer and then input the selected top 40 feature data from the SNP site into the LSTM layer. The Dense layer and the LSTM layer are combined as the model input to train the DNN model.</p>
</sec>
</sec>
</sec>
<sec id="S3">
<title>Results</title>
<p>We first used the XGBoost classification model and made five predictions using KMER (110 samples <sup>&#x2217;</sup> 559,494 <italic>k</italic>-mers features) and SNPs (110 samples <sup>&#x2217;</sup> 164,138 SNPs features) data. For each experiment, we set different random states from 1 to 5. Similarly, the XGBoost regression model was used to make five times predictions for both <italic>k</italic>-mers and SNPs data. The random states parameter was taken from 1 to 5 in order to maintain consistency in the splitting of the dataset for comparative analysis of the results. A comparison of the prediction accuracies of the models was then performed. The Boxplot grouping in <xref ref-type="fig" rid="F2">Figure 2</xref> shows the accuracy values for each of the five predictions, and <xref ref-type="table" rid="T1">Table 1</xref> shows their mean accuracy. From these results, it is clear that the XGBoost regression model predicts better than the classification model, for both <italic>k</italic>-mers and SNPs data. In addition, in terms of the input feature type, XGBoost predicted <italic>k</italic>-mers data with better accuracy than SNPs, possibly related to the fact that SNPs is a binary input of 0 and 1. The mean predictive accuracy of the XGBoost classification model for SNPs was 0.8, while the mean accuracy of the XGBoost regression model for <italic>k</italic>-mers reached 0.8909091.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Boxplots with jittered data points of XGBoost prediction accuracies for all features. It can be seen that the results of XGBoost regression are better than the classification and that XGBoost performed better with the <italic>k</italic>-mers characteristics than with SNPs.</p></caption>
<graphic xlink:href="fmicb-12-712886-g002.tif"/>
</fig>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Mean prediction accuracies of the XGBoost algorithm using all features of <italic>k</italic>-mers or SNPs (five times).</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>XGBoost</bold></td>
<td valign="top" align="center"><bold><italic>k</italic>-mers</bold></td>
<td valign="top" align="center"><bold>SNPs</bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Classification</td>
<td valign="top" align="center">0.845</td>
<td valign="top" align="center">0.800</td>
</tr>
<tr>
<td valign="top" align="left">Regression</td>
<td valign="top" align="center">0.891</td>
<td valign="top" align="center">0.818</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The top 10 important features of the classification and regression models with <italic>k</italic>-mers and SNPs data were statistically analyzed, respectively, and presented in the bar chart in <xref ref-type="fig" rid="F3">Figure 3</xref>. As can be seen from the figure, the top 10 features of the five attempts did not completely coincide, but some common features can be found. For example, for <italic>k</italic>-mers&#x2019; classification model, CGACAGTCTC appears in all five runs, GACTCCTAGC appears four times in <italic>k</italic>-mers&#x2019; regression model, and A2872728 and G17357 also appear four times each in SNPs&#x2019; regression model.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Horizontal barplots for the top 10 important features of XGBoost classification and regression models.</p></caption>
<graphic xlink:href="fmicb-12-712886-g003.tif"/>
</fig>
<p>To further optimize the model, the <italic>k</italic>-mers and SNPs top 40 feature datasets were taken for modeling and prediction by XGBoost regression and DNN regression, respectively. In order to enhance the reliability of the results, we used the XGBoost regression algorithm to model and predict all the features of <italic>k</italic>-mers and SNPs for another five times (the random_state parameter of the train_test_split function was taken from 6 to 10), and we again took their top 40 feature datasets for the XGBoost regression and DNN regression modeling and prediction.</p>
<p>Next, we ran the XGBoost regression model 10 times, and for the top 40 feature dataset for each experiment, we ran the XGBoost regression prediction 10 times (random states from 1 to 10). The Boxplot grouping in <xref ref-type="fig" rid="F4">Figure 4</xref> shows the accuracy values for each of the 100 predictions, and <xref ref-type="table" rid="T2">Table 2</xref> tallies their mean values. The XGBoost regressions for <italic>k</italic>-mers, SNPs, and <italic>k</italic>-mers &#x0026; SNPs data had prediction accuracies of 0.9113636, 0.8522727, and 0.9127273, with the lowest predictive accuracy for SNPs and the best for <italic>k</italic>-mers &#x0026; SNPs. Overall, the XGBoost regression model predicted the top 40 feature dataset better than the predictions for all feature datasets, for both <italic>k</italic>-mers and SNPs (<xref ref-type="table" rid="T1">Tables 1</xref>, <xref ref-type="table" rid="T2">2</xref>). We show the <italic>y</italic>-test and <italic>y</italic> predicted values for all 100 predictions and see that the predicted values largely fluctuate around the true values (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>Boxplots with jittered data points of XGBoost prediction accuracies for top 40 features. Since 10 &#x00D7; 10 = 100 modeling predictions were made, the results of 100 predictions in each box could be seen in XGBoost&#x2019;s comparison with <italic>k</italic>-mers, SNPs, and <italic>k</italic>-mers &#x0026; SNPs.</p></caption>
<graphic xlink:href="fmicb-12-712886-g004.tif"/>
</fig>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Mean prediction accuracies of the XGBoost algorithm using top 40 features of <italic>k</italic>-mers or/and SNPs (10 &#x00D7; 10 times).</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>XGBoost (Top 40)</bold></td>
<td valign="top" align="center"><bold><italic>k</italic>-mers</bold></td>
<td valign="top" align="center"><bold>SNPs</bold></td>
<td valign="top" align="center"><bold><italic>k</italic>-mers &#x0026; SNPs</bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Regression</td>
<td valign="top" align="center">0.911</td>
<td valign="top" align="center">0.852</td>
<td valign="top" align="center">0.913</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Scatter plots of true test values and predicted values of MIC using XGBoost algorithm for top 40 features (left for <italic>k</italic>-mers, middle for SNPs, and right for <italic>k</italic>-mers &#x0026; SNPs). As the original <italic>y</italic>-value was discrete, several horizontal lines were presented in the figure. The predicted values were clustered around these lines; RMSE and <italic>R</italic><sup>2</sup> values were also calculated and shown at the top of the figure.</p></caption>
<graphic xlink:href="fmicb-12-712886-g005.tif"/>
</fig>
<p>Similarly, for the DNN model, the top 40 important features selected by XGBoost were trained for a total of 100 times with random data splits, respectively. The Boxplot grouping in <xref ref-type="fig" rid="F6">Figure 6</xref> shows the accuracy values of 1,000 times of prediction, and their average values are calculated in <xref ref-type="table" rid="T3">Table 3</xref>, and the test and predicted values for all 1,000 predictions are shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. Regressions for <italic>k</italic>-mers, SNPs, and <italic>k</italic>-mers &#x0026; SNPs had prediction accuracies of 0.9189091, 0.8705455, and 0.9177273, respectively, with the lowest prediction accuracy for SNPs and very similar prediction accuracies for <italic>k</italic>-mers and <italic>k</italic>-mers &#x0026; SNPs, all of which were relatively high.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption><p>Boxplots with jittered data points of DNN prediction accuracies for top 40 features (100 &#x00D7; 10 times training).</p></caption>
<graphic xlink:href="fmicb-12-712886-g006.tif"/>
</fig>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Mean prediction accuracies of the DNN algorithm using top 40 features of <italic>k</italic>-mers or/and SNPs (100 &#x00D7; 10 times training).</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>DNN (Top 40)</bold></td>
<td valign="top" align="center"><bold><italic>k</italic>-mers</bold></td>
<td valign="top" align="center"><bold>SNPs</bold></td>
<td valign="top" align="center"><bold><italic>k</italic>-mers &#x0026; SNPs</bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Regression</td>
<td valign="top" align="center">0.919</td>
<td valign="top" align="center">0.871</td>
<td valign="top" align="center">0.918</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption><p>Scatter plots of true test <italic>y</italic>-values and predicted <italic>y</italic>-values using DNN algorithm for top 40 features (100 &#x00D7; 10 times training). The predicted values were clustered around these lines; RMSE and <italic>R</italic><sup>2</sup> values were also calculated and shown at the top of the figure.</p></caption>
<graphic xlink:href="fmicb-12-712886-g007.tif"/>
</fig>
<p>For regression models, the root mean square error (RMSE) between the predicted and true values is usually used as a model evaluation metric, and the coefficient of determination (<italic>R</italic><sup>2</sup>) is used to indicate how well the model predicts the true value compared to the mean value model. We calculated the RMSE and <italic>R</italic><sup>2</sup> values of our XGBoost and DNN models. For our XGBoost models, the RMSE values were 1.734, 2.781, and 1.717, and <italic>R</italic><sup>2</sup> values were 0.860, 0.640, and 0.863, respectively (<xref ref-type="fig" rid="F5">Figure 5</xref>). The RMSEs of the DNN models were 1.955, 2.179, and 2.045 and the <italic>R</italic><sup>2</sup> values of the DNN model were 0.836, 0.796, and 0.820 (<xref ref-type="fig" rid="F7">Figure 7</xref>). <italic>R</italic><sup>2</sup> is an indicator used in regression models to evaluate the degree of agreement between the predicted value and the actual value, with a maximum value of 1. It can be seen that, overall, our models fit well.</p>
<p>In summary, our analysis showed that the XGBoost classification model reached over 80% prediction accuracy, and the model with <italic>k</italic>-mers data gave better results than SNPs inputs. Compared with the XGBoost classification model, the overall performance of the XGBoost regression model is improved (89.1 and 81.8% for <italic>k</italic>-mers and SNPs data, respectively). The MIC value is continuously distributed, and the effect of the regression model may be more realistic. DNN neural network models perform better in predicting MIC values with improved overall accuracy compared to XGBoost models. On the other hand, the <italic>k</italic>-mers and SNPs top 40 feature dataset was sufficient to obtain good prediction results (above 85% accuracy), with <italic>k</italic>-mers and mixed <italic>k</italic>-mers &#x0026; SNPs features performing well and the DNN regression model performing better than the XGBoost regression approach.</p>
</sec>
<sec id="S4">
<title>Discussion</title>
<p>Based on metagenomic data, in this study, sequence analysis was used to obtain SNPs information and nucleotide <italic>k</italic>-mers count information queue data; machine learning and deep learning methods were then applied to establish a prediction model for the MIC value of <italic>K. pneumoniae</italic>. By feature selection, we proposed a top 40 feature-based regression model, which had the best predictive performance of 91%.</p>
<p>First, according to <xref ref-type="bibr" rid="B13">Naha et al. (2021)</xref> and <xref ref-type="bibr" rid="B17">Okanda et al. (2020)</xref>, we found that gene mutations may affect drug resistance of <italic>Klebsiella</italic>; thus, we tried to find the relevant sites affecting resistance by calling SNPs. After pre-processing the raw data by using bioinformatics tools BWA, BCFtools, and SAMtools, we obtained a matrix of mutation site and sample list. We took the mutated gene site as the features and built the machine learning model of classification and regression, respectively. We used 110 samples for prediction, and the prediction results above show that the mean accuracy of the SNPs classification model was 80% and the mean accuracy of the SNPs regression model was 81.81%, which shows that the performance of the regression model is better than the multi-classification model. Then, based on the method previously described by <xref ref-type="bibr" rid="B16">Nguyen et al. (2019)</xref>, we created both XGBoost classification and regression models using <italic>k</italic>-mers counts as input features, respectively, and made MIC predictions for 110 samples. As described above, after five runs, we obtained a mean accuracy of 84.54% for the <italic>k</italic>-mers classification model and 89.09% accuracy for the <italic>k</italic>-mers regression model. This result again shows that the multi-classification model does not perform as well as the regression model. In addition, the prediction of MIC values using SNPs loci was less effective than that of <italic>k</italic>-mers prediction, which may be due to the fact that the input to the SNPs is binary data with only mutated (labeled as 1) and unmutated (labeled as 0) features, while the inputs to the <italic>k</italic>-mers counting model are continuous variables, making it more effective for regression model training.</p>
<p>To evaluate our model, we compared MIC prediction models built by related studies. In the study by <xref ref-type="bibr" rid="B23">ValizadehAslani et al. (2020)</xref>, the authors used the XGBoost model with <italic>k</italic>-mers features, and the result shows an accuracy of around 91% in predicting the MIC value of meropenem against <italic>K. pneumoniae</italic>, which was close to our results. Another study by <xref ref-type="bibr" rid="B16">Nguyen et al. (2019)</xref> also used the XGBoost model to predict MICs for non-typhoidal <italic>Salmonella</italic>, resulting in an average accuracy of 90% without a large number of samples. We decided to try a more advanced deep learning approach for prediction. As the <italic>k</italic>-mers and SNPs had too many feature values, and the neural network could not accept features with too high dimensions, we selected some of the important features as the training data to avoid overfitting.</p>
<p>The XGBoost regression model gives a score of importance for each feature during the training process. We selected the top 40 highest scores from the <italic>k</italic>-mers and the SNPs regression model, respectively, and then we used these total 80 important features as a new dataset, to predict MIC values using both XGBoost and DNN algorithms. In consideration of training time and server capacity, we only use regression models for prediction.</p>
<p>Comparing the results in <xref ref-type="table" rid="T2">Tables 2</xref>, <xref ref-type="table" rid="T3">3</xref>, the DNN model performs better than the classical XGBoost machine learning approach in predicting MIC values, with a slight improvement in both accuracy rates. However, the reason for the small improvement may be due to the fact that only important features were selected for training and the overall amount of sample size was relatively small. In addition, the prediction accuracy of the model improved when the significant features of <italic>k</italic>-mers and SNPs were combined to produce a new dataset, compared to training with a single type of feature.</p>
<p>We found the annotated.gff file of the reference genome from NCBI and the paper on the whole-genome analysis of the reference genome HS11286 by the team of Liu (<xref ref-type="bibr" rid="B11">Liu et al., 2012</xref>); the <italic>K. pneumoniae</italic> resistance genes were found from this paper and we identified loci belonging to these gene fragments from important features in the SNPs model. The pKPHS3 was mentioned in the study (<xref ref-type="bibr" rid="B11">Liu et al., 2012</xref>) as possessing 13 important resistance determinants, such as tetG, cat, sul1, dfrA12, aac(3)-Ia, and aph. Genes were found among the important features of our SNPs, such as site T37808, which belongs to the tetG gene family, an important gene family that influences tetracycline resistance. This demonstrates that the important feature values obtained from our model training may help us to understand the reasons for the development of resistance, and why there are anti-tetracycline resistance genes present due to the presence of tra isoconjugate transfer genes in pKPHS2 and pKPHS3, which is the type of gene that causes resistance to spread between genera (<xref ref-type="bibr" rid="B11">Liu et al., 2012</xref>). Moreover, meropenem belongs to the class of beta-lactam antibiotics, which are classified as carbapenems. According to <xref ref-type="bibr" rid="B20">Reyes et al. (2019)</xref>, the most common resistance mechanism of <italic>K. pneumoniae</italic> to carbapenem antibiotics is the production of enzymes with carbapenemase activity, which hydrolyze beta-lactam antibiotics, while we also identified mutations in the beta-lactamase gene from important features in SNPs models, such as C1114518 and G1114674; i.e., mutations in the beta-lactamase gene may be responsible for the high MIC values.</p>
<p>In summary, we found that there are still a lot of genes in <italic>Klebsiella</italic> that belong to hypothetical proteins, and the loci we derived from this study can help to annotate and study these hypothetical proteins. Furthermore, in clinical practice, deep learning-based modeling and prediction by selecting important feature values can significantly improve detection efficiency compared to experimental methods of measuring MIC values, providing doctors with a faster access to information on patient resistance for drug administration and improving the effectiveness of antibiotic use, enabling patients to receive medication promptly. It also reduces the cost of the experiment.</p>
</sec>
<sec id="S5">
<title>Additional Information</title>
<p>CentOS Linux release 7.2.1511 (Core)</p>
<p>Linux version 3.10.0-327.el7.x86_64 (<email>builder@kbuilder.dev.centos.org</email>) (gcc version 4.8.3 20140911 (Red Hat 4.8.3-9) (GCC))</p>
<p>jupyter lab version 0.34.9</p>
<p>Python 3.7.2</p>
</sec>
<sec id="S6">
<title>Data Availability Statement</title>
<p>The metagenomic sequence data included in this study can be found in the NCBI SRA (BioProject accession numbers <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA376414">PRJNA376414</ext-link>, <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA386693">PRJNA386693</ext-link>, and <ext-link ext-link-type="DDBJ/EMBL/GenBank" xlink:href="PRJNA396774">PRJNA396774</ext-link>).</p>
</sec>
<sec id="S7">
<title>Author Contributions</title>
<p>JG and JL conceived ideas and designed the study. AY, JG, XJW, and XLW wrote the manuscript. ZML and RJ performed the bioinformatics analysis. RT and ZQL constructed the machine learning models. All authors read or revised the manuscript and approved the final version.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>JG, AY, RT, XJW, and RJ are employed by Shanghai Biotecan Pharmaceuticals Co., Ltd. and Shanghai Zhangjiang Institute of Medical Innovation. The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="pudiscl1">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<fn-group>
<fn fn-type="financial-disclosure">
<p><bold>Funding.</bold> This study was mainly supported by the National Key R&#x0026;D Program of China (2018YFE0102400) and secondly by the Talent Development Project for Three-year Action Plan of Shanghai Public Health System Construction (GWV-10.2-XD03 and GWV-10.2-YQ50) and the project Clinical Treatment Optimization Decision System Based on Deep Reinforcement Learning by Shanghai Municipal Commission of Economy and Informatization (2020-RGZN-02039). XJW received funding from the Scientific Instrument Application Methods Project of Shanghai Science and Technology Innovation Action Plan (No. 19142200800).</p>
</fn>
</fn-group>
<ack>
<p>We would like to thank Yingxia Pan and Xiaoming Li who contributed to data pre-processing and programming.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Baldwin</surname> <given-names>C. M.</given-names></name> <name><surname>Lyseng-Williamson</surname> <given-names>K. A.</given-names></name> <name><surname>Keam</surname> <given-names>S. J.</given-names></name></person-group> (<year>2008</year>). <article-title>Meropenem: a review of its use in the treatment of serious bacterial infections.</article-title> <source><italic>Drugs</italic></source> <volume>68</volume> <fpage>803</fpage>&#x2013;<lpage>838</lpage>. <pub-id pub-id-type="doi">10.2165/00003495-200868060-00006</pub-id> <pub-id pub-id-type="pmid">18416587</pub-id></citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bankevich</surname> <given-names>A.</given-names></name> <name><surname>Nurk</surname> <given-names>S.</given-names></name> <name><surname>Antipov</surname> <given-names>D.</given-names></name> <name><surname>Gurevich</surname> <given-names>A. A.</given-names></name> <name><surname>Dvorkin</surname> <given-names>M.</given-names></name> <name><surname>Kulikov</surname> <given-names>A. S.</given-names></name><etal/></person-group> (<year>2012</year>). <article-title>SPAdes: a new genome assembly algorithm and its applications to single-cell sequencing.</article-title> <source><italic>J. Comput. Biol.</italic></source> <volume>19</volume> <fpage>455</fpage>&#x2013;<lpage>477</lpage>. <pub-id pub-id-type="doi">10.1089/cmb.2012.0021</pub-id> <pub-id pub-id-type="pmid">22506599</pub-id></citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bonfield</surname> <given-names>J. K.</given-names></name> <name><surname>Marshall</surname> <given-names>J.</given-names></name> <name><surname>Danecek</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Ohan</surname> <given-names>V.</given-names></name> <name><surname>Whitwham</surname> <given-names>A.</given-names></name><etal/></person-group> (<year>2021</year>). <article-title>HTSlib: C library for reading/writing high-throughput sequencing data.</article-title> <source><italic>Gigascience</italic></source> <volume>10</volume>:<fpage>giab007</fpage>. <pub-id pub-id-type="doi">10.1093/gigascience/giab007</pub-id> <pub-id pub-id-type="pmid">33594436</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>T.</given-names></name> <name><surname>Guestrin</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). &#x201C;<article-title>XGBoost: a scalable tree boosting system</article-title>,&#x201D; in <source><italic>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data</italic></source>, (<publisher-loc>San Francisco CA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>).</citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dargan</surname> <given-names>S.</given-names></name> <name><surname>Kumar</surname> <given-names>M.</given-names></name> <name><surname>Ayyagari</surname> <given-names>M. R.</given-names></name> <name><surname>Kumar</surname> <given-names>G.</given-names></name></person-group> (<year>2019</year>). <article-title>A survey of deep learning and its applications: a new paradigm to machine learning.</article-title> <source><italic>Arch. Comput. Methods Eng.</italic></source> <volume>27</volume> <fpage>1071</fpage>&#x2013;<lpage>1092</lpage>.</citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Effah</surname> <given-names>C. Y.</given-names></name> <name><surname>Sun</surname> <given-names>T.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title><italic>Klebsiella pneumoniae</italic>: an increasing threat to public health.</article-title> <source><italic>Ann. Clin. Microbiol. Antimicrob.</italic></source> <volume>19</volume>:<fpage>1</fpage>. <pub-id pub-id-type="doi">10.1186/s12941-019-0343-8</pub-id> <pub-id pub-id-type="pmid">31918737</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eyre</surname> <given-names>D. W.</given-names></name> <name><surname>De Silva</surname> <given-names>D.</given-names></name> <name><surname>Cole</surname> <given-names>K.</given-names></name> <name><surname>Peters</surname> <given-names>J.</given-names></name> <name><surname>Cole</surname> <given-names>M. J.</given-names></name> <name><surname>Grad</surname> <given-names>Y. H.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>WGS to predict antibiotic MICs for <italic>Neisseria gonorrhoeae</italic>.</article-title> <source><italic>J. Antimicrob. Chemother.</italic></source> <volume>72</volume> <fpage>1937</fpage>&#x2013;<lpage>1947</lpage>. <pub-id pub-id-type="doi">10.1093/jac/dkx067</pub-id> <pub-id pub-id-type="pmid">28333355</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2011</year>). <article-title>A statistical framework for SNPs calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data.</article-title> <source><italic>Bioinformatics</italic></source> <volume>27</volume> <fpage>2987</fpage>&#x2013;<lpage>2993</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btr509</pub-id> <pub-id pub-id-type="pmid">21903627</pub-id></citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Metcalf</surname> <given-names>B. J.</given-names></name> <name><surname>Chochua</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Gertz</surname> <given-names>R. E.</given-names> <suffix>Jr.</suffix></name> <name><surname>Walker</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Penicillin-binding protein transpeptidase signatures for tracking and predicting beta-lactam resistance levels in <italic>Streptococcus pneumoniae</italic>.</article-title> <source><italic>mBio</italic></source> <volume>7</volume>:<fpage>e00756</fpage>-<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1128/mBio.00756-16</pub-id> <pub-id pub-id-type="pmid">27302760</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Metcalf</surname> <given-names>B. J.</given-names></name> <name><surname>Chochua</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Gertz</surname> <given-names>R. E.</given-names> <suffix>Jr.</suffix></name> <name><surname>Walker</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Validation of beta-lactam minimum inhibitory concentration predictions for pneumococcal isolates with newly encountered penicillin binding protein (PBP) sequences.</article-title> <source><italic>BMC Genomics</italic></source> <volume>18</volume>:<fpage>621</fpage>. <pub-id pub-id-type="doi">10.1186/s12864-017-4017-7</pub-id> <pub-id pub-id-type="pmid">28810827</pub-id></citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>P.</given-names></name> <name><surname>Jiang</surname> <given-names>X.</given-names></name> <name><surname>Bi</surname> <given-names>D.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Tai</surname> <given-names>C.</given-names></name><etal/></person-group> (<year>2012</year>). <article-title>Complete genome sequence of <italic>Klebsiella pneumoniae</italic> subsp. pneumoniae HS11286, a multidrug-resistant strain isolated from human sputum.</article-title> <source><italic>J. Bacteriol.</italic></source> <volume>194</volume> <fpage>1841</fpage>&#x2013;<lpage>1842</lpage>. <pub-id pub-id-type="doi">10.1128/JB.00043-12</pub-id> <pub-id pub-id-type="pmid">22408243</pub-id></citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Long</surname> <given-names>S. W.</given-names></name> <name><surname>Olsen</surname> <given-names>R. J.</given-names></name> <name><surname>Eagar</surname> <given-names>T. N.</given-names></name> <name><surname>Beres</surname> <given-names>S. B.</given-names></name> <name><surname>Zhao</surname> <given-names>P.</given-names></name> <name><surname>Davis</surname> <given-names>J. J.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Population genomic analysis of 1,777 extended-spectrum beta-lactamase-producing <italic>Klebsiella pneumoniae</italic> isolates, Houston, Texas: unexpected abundance of clonal group 307.</article-title> <source><italic>mBio</italic></source> <volume>8</volume>:<fpage>e00489</fpage>-<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1128/mBio.00489-17</pub-id> <pub-id pub-id-type="pmid">28512093</pub-id></citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Naha</surname> <given-names>S.</given-names></name> <name><surname>Sands</surname> <given-names>K.</given-names></name> <name><surname>Mukherjee</surname> <given-names>S.</given-names></name> <name><surname>Saha</surname> <given-names>B.</given-names></name> <name><surname>Dutta</surname> <given-names>S.</given-names></name> <name><surname>Basu</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>OXA-181-like carbapenemases in <italic>Klebsiella pneumoniae</italic> ST14, ST15, ST23, ST48, and ST231 from septicemic neonates: coexistence with NDM-5, resistome, transmissibility, and genome diversity.</article-title> <source><italic>mSphere</italic></source> <volume>6</volume>:<fpage>e01156</fpage>-<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1128/mSphere.01156-20</pub-id> <pub-id pub-id-type="pmid">33441403</pub-id></citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Navon-Venezia</surname> <given-names>S.</given-names></name> <name><surname>Kondratyeva</surname> <given-names>K.</given-names></name> <name><surname>Carattoli</surname> <given-names>A.</given-names></name></person-group> (<year>2017</year>). <article-title><italic>Klebsiella pneumoniae</italic>: a major worldwide source and shuttle for antibiotic resistance.</article-title> <source><italic>FEMS Microbiol. Rev.</italic></source> <volume>41</volume> <fpage>252</fpage>&#x2013;<lpage>275</lpage>. <pub-id pub-id-type="doi">10.1093/femsre/fux013</pub-id> <pub-id pub-id-type="pmid">28521338</pub-id></citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>M.</given-names></name> <name><surname>Brettin</surname> <given-names>T.</given-names></name> <name><surname>Long</surname> <given-names>S. W.</given-names></name> <name><surname>Musser</surname> <given-names>J. M.</given-names></name> <name><surname>Olsen</surname> <given-names>R. J.</given-names></name> <name><surname>Olson</surname> <given-names>R.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>Developing an in silico minimum inhibitory concentration panel test for <italic>Klebsiella pneumoniae</italic>.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>8</volume>:<fpage>421</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-18972-w</pub-id> <pub-id pub-id-type="pmid">29323230</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>M.</given-names></name> <name><surname>Long</surname> <given-names>S. W.</given-names></name> <name><surname>McDermott</surname> <given-names>P. F.</given-names></name> <name><surname>Olsen</surname> <given-names>R. J.</given-names></name> <name><surname>Olson</surname> <given-names>R.</given-names></name> <name><surname>Stevens</surname> <given-names>R. L.</given-names></name></person-group> (<year>2019</year>). <article-title>Using machine learning to predict antimicrobial mics and associated genomic features for nontyphoidal <italic>Salmonella</italic>.</article-title> <source><italic>J. Clin. Microbiol.</italic></source> <volume>57</volume>:<fpage>e01260</fpage>-<lpage>18</lpage>.</citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Okanda</surname> <given-names>T.</given-names></name> <name><surname>Haque</surname> <given-names>A.</given-names></name> <name><surname>Koshikawa</surname> <given-names>T.</given-names></name> <name><surname>Islam</surname> <given-names>A.</given-names></name> <name><surname>Huda</surname> <given-names>Q.</given-names></name> <name><surname>Takemura</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>Characteristics of carbapenemase-producing <italic>Klebsiella pneumoniae</italic> isolated in the intensive care unit of the largest tertiary hospital in Bangladesh.</article-title> <source><italic>Front. Microbiol.</italic></source> <volume>11</volume>:<fpage>612020</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2020.612020</pub-id> <pub-id pub-id-type="pmid">33519767</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pataki</surname> <given-names>B. A.</given-names></name> <name><surname>Matamoros</surname> <given-names>S.</given-names></name> <name><surname>van der Putten</surname> <given-names>B. C. L.</given-names></name> <name><surname>Remondini</surname> <given-names>D.</given-names></name> <name><surname>Giampieri</surname> <given-names>E.</given-names></name> <name><surname>Aytan-Aktug</surname> <given-names>D.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>Understanding and predicting ciprofloxacin minimum inhibitory concentration in <italic>Escherichia coli</italic> with machine learning.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>10</volume>:<fpage>15026</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-71693-5</pub-id> <pub-id pub-id-type="pmid">32929164</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pesesky</surname> <given-names>M. W.</given-names></name> <name><surname>Hussain</surname> <given-names>T.</given-names></name> <name><surname>Wallace</surname> <given-names>M.</given-names></name> <name><surname>Patel</surname> <given-names>S.</given-names></name> <name><surname>Andleeb</surname> <given-names>S.</given-names></name> <name><surname>Burnham</surname> <given-names>C. D.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Evaluation of machine learning and rules-based approaches for predicting antimicrobial resistance profiles in gram-negative <italic>Bacilli</italic> from whole genome sequence data.</article-title> <source><italic>Front. Microbiol.</italic></source> <volume>7</volume>:<fpage>1887</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2016.01887</pub-id> <pub-id pub-id-type="pmid">27965630</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reyes</surname> <given-names>J.</given-names></name> <name><surname>Aguilar</surname> <given-names>A. C.</given-names></name> <name><surname>Caicedo</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Carbapenem-Resistant <italic>Klebsiella pneumoniae</italic>: microbiology key points for clinical practice.</article-title> <source><italic>Int. J. Gen. Med.</italic></source> <volume>12</volume> <fpage>437</fpage>&#x2013;<lpage>446</lpage>. <pub-id pub-id-type="doi">10.2147/IJGM.S214305</pub-id> <pub-id pub-id-type="pmid">31819594</pub-id></citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Santhanam</surname> <given-names>R.</given-names></name></person-group> (<year>2016</year>). <article-title>Comparative study of XGBoost4j and gradient boosting for linear regression.</article-title> <source><italic>Int. J. Control Theory Appl.</italic></source> <volume>9</volume> <fpage>1131</fpage>&#x2013;<lpage>1142</lpage>.</citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Spagnolo</surname> <given-names>A. M.</given-names></name> <name><surname>Orlando</surname> <given-names>P.</given-names></name> <name><surname>Panatto</surname> <given-names>D.</given-names></name> <name><surname>Perdelli</surname> <given-names>F.</given-names></name> <name><surname>Cristina</surname> <given-names>M. L.</given-names></name></person-group> (<year>2014</year>). <article-title>An overview of carbapenem-resistant <italic>Klebsiella pneumoniae</italic>: epidemiology and control measures.</article-title> <source><italic>Rev. Med. Microbiol.</italic></source> <volume>25</volume> <fpage>7</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1097/MRM.0b013e328365c51e</pub-id></citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>ValizadehAslani</surname> <given-names>T.</given-names></name> <name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Sokhansanj</surname> <given-names>B. A.</given-names></name> <name><surname>Rosen</surname> <given-names>G. L.</given-names></name></person-group> (<year>2020</year>). <article-title>Amino acid k-mer feature extraction for Quantitative Antimicrobial Resistance (AMR) prediction by machine learning and model interpretation for biological insights.</article-title> <source><italic>Biology</italic></source> <volume>9</volume>:<fpage>365</fpage>. <pub-id pub-id-type="doi">10.3390/biology9110365</pub-id> <pub-id pub-id-type="pmid">33126516</pub-id></citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Niehaus</surname> <given-names>K. E.</given-names></name> <name><surname>Walker</surname> <given-names>T. M.</given-names></name> <name><surname>Iqbal</surname> <given-names>Z.</given-names></name> <name><surname>Walker</surname> <given-names>A. S.</given-names></name> <name><surname>Wilson</surname> <given-names>D. J.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>Machine learning for classifying tuberculosis drug-resistance from DNA sequencing data.</article-title> <source><italic>Bioinformatics</italic></source> <volume>34</volume> <fpage>1666</fpage>&#x2013;<lpage>1671</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btx801</pub-id> <pub-id pub-id-type="pmid">29240876</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Walker</surname> <given-names>T. M.</given-names></name> <name><surname>Walker</surname> <given-names>A. S.</given-names></name> <name><surname>Wilson</surname> <given-names>D. J.</given-names></name> <name><surname>Peto</surname> <given-names>T. E. A.</given-names></name> <name><surname>Crook</surname> <given-names>D. W.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>DeepAMR for predicting co-occurrent resistance of <italic>Mycobacterium tuberculosis</italic>.</article-title> <source><italic>Bioinformatics</italic></source> <volume>35</volume> <fpage>3240</fpage>&#x2013;<lpage>3249</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz067</pub-id> <pub-id pub-id-type="pmid">30689732</pub-id></citation></ref>
<ref id="B26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zador</surname> <given-names>A. M.</given-names></name></person-group> (<year>2019</year>). <article-title>A critique of pure learning and what artificial neural networks can learn from animal brains.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>10</volume>:<fpage>3770</fpage>.</citation></ref>
</ref-list>
<fn-group>
<fn id="footnote1">
<label>1</label>
<p><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/assembly/GCF_000240185.1">https://www.ncbi.nlm.nih.gov/assembly/GCF_000240185.1</ext-link></p></fn>
</fn-group>
</back>
</article>