<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2022.790063</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Microbiology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>iThermo: A Sequence-Based Model for Identifying Thermophilic Proteins Using a Multi-Feature Fusion Strategy</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ahmed</surname> <given-names>Zahoor</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1533846/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zulfiqar</surname> <given-names>Hasan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1101292/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Khan</surname> <given-names>Abdullah Aman</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Gul</surname> <given-names>Ijaz</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1512456/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Dao</surname> <given-names>Fu-Ying</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/640027/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname> <given-names>Zhao-Yue</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c003"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1503844/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yu</surname> <given-names>Xiao-Long</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1651208/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Tang</surname> <given-names>Lixia</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/611357/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Life Sciences and Technology, Center for Informational Biology, University of Electronic Science and Technology of China</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>School of Computer Science and Engineering, University of Electronic Science and Technology of China</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Sichuan Artificial Intelligence Research Institute</institution>, <addr-line>Yibin</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Tsinghua Shenzhen International Graduate School, Institute of Biopharmaceutical and Health Engineering, Tsinghua University</institution>, <addr-line>Shenzhen</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Materials Science and Engineering, Hainan University</institution>, <addr-line>Haikou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Carmen Vargas, University of Seville, Spain</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Yi Xiong, Shanghai Jiao Tong University, China; Jan Zrimec, National Institute of Biology (NIB), Slovenia</p></fn>
<corresp id="c001">&#x002A;Correspondence: Lixia Tang, <email>lixiatang@uestc.edu.cn</email></corresp>
<corresp id="c002">Xiao-Long Yu, <email>yuxiaolong@hainanu.edu.cn</email></corresp>
<corresp id="c003">Zhao-Yue Zhang, <email>zyzhang@uestc.edu.cn</email></corresp>
<fn fn-type="other" id="fn004"><p>This article was submitted to Systems Microbiology, a section of the journal Frontiers in Microbiology</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>22</day>
<month>02</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>790063</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>10</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>01</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2022 Ahmed, Zulfiqar, Khan, Gul, Dao, Zhang, Yu and Tang.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Ahmed, Zulfiqar, Khan, Gul, Dao, Zhang, Yu and Tang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Thermophilic proteins have important application value in biotechnology and industrial processes. The correct identification of thermophilic proteins provides important information for the application of these proteins in engineering. The identification method of thermophilic proteins based on biochemistry is laborious, time-consuming, and costly. Therefore, there is an urgent need for a fast and accurate method to identify thermophilic proteins. Considering this urgency, we constructed a reliable benchmark dataset containing 1,368 thermophilic and 1,443 non-thermophilic proteins. A multi-layer perceptron (MLP) model based on a multi-feature fusion strategy was proposed to discriminate thermophilic proteins from non-thermophilic proteins. On an independent dataset, the proposed model achieved an accuracy of 96.26%, which demonstrates that the model has a good application prospect. In order to use the model conveniently, a user-friendly software package called iThermo was established and can be freely accessed at <ext-link ext-link-type="uri" xlink:href="http://lin-group.cn/server/iThermo/index.html">http://lin-group.cn/server/iThermo/index.html</ext-link>. The high accuracy of the model and the practicability of the developed software package indicate that this study can accelerate the discovery and engineering application of thermally stable proteins.</p>
</abstract>
<kwd-group>
<kwd>thermophilic proteins</kwd>
<kwd>protein feature extraction</kwd>
<kwd>feature selection</kwd>
<kwd>neural network</kwd>
<kwd>iThermo</kwd>
</kwd-group>
<counts>
<fig-count count="3"/>
<table-count count="2"/>
<equation-count count="23"/>
<ref-count count="76"/>
<page-count count="11"/>
<word-count count="8234"/>
</counts>
</article-meta>
</front>
<body>
<sec id="S1" sec-type="intro">
<title>Introduction</title>
<p>In the field of industrial and biotechnology development, researchers usually increase the temperature to shorten the enzymatic reaction time (<xref ref-type="bibr" rid="B58">Tang et al., 2017</xref>). However, the increase in temperature leads to the denaturation of protein, resulting in the loss of protein activity. Maintaining the activity of protein under increasing temperature conditions is a hot topic in the current engineering field. It is well known that temperature is crucial to cellular life. It has been reported that some organisms can live in a high-temperature environment. In general, the organisms that survive at an optimal growth temperature (OGT) below 50&#x00B0;C are regarded as mesophilic organisms, and the organisms that can survive at the OGT of 50&#x00B0;C or above are called thermophilic organisms (<xref ref-type="bibr" rid="B24">Gromiha and Suresh, 2008</xref>). Thermophiles can produce thermally stable proteins and even effectively resist high temperatures of up to 120&#x00B0;C (<xref ref-type="bibr" rid="B17">Fan et al., 2016</xref>; <xref ref-type="bibr" rid="B58">Tang et al., 2017</xref>). Therefore, the study of proteins produced by thermophilic organisms is significant for the development of enzyme engineering (<xref ref-type="bibr" rid="B27">Huang and Gong, 2020</xref>; <xref ref-type="bibr" rid="B65">Wang et al., 2020</xref>; <xref ref-type="bibr" rid="B1">Alim et al., 2021</xref>; <xref ref-type="bibr" rid="B56">Suresh et al., 2021</xref>; <xref ref-type="bibr" rid="B72">Zou et al., 2021</xref>).</p>
<p>There have been many studies on thermophilic proteins. It is found that the thermal stability of proteins is related to amino acid distribution in proteins (<xref ref-type="bibr" rid="B19">Fukuchi and Nishikawa, 2001</xref>; <xref ref-type="bibr" rid="B71">Zhou et al., 2008</xref>). In addition to amino acid distribution, dipeptide composition (DC) contributes effectively to protein thermal stability (<xref ref-type="bibr" rid="B15">Ding et al., 2004</xref>; <xref ref-type="bibr" rid="B68">Zhang and Fang, 2007</xref>; <xref ref-type="bibr" rid="B50">Nakariyakul et al., 2012</xref>). In addition, previous studies have reported that the factors affecting the thermal stability of proteins also include hydrophobicity (<xref ref-type="bibr" rid="B53">Saraboji et al., 2005</xref>; <xref ref-type="bibr" rid="B49">Miyazaki et al., 2006</xref>; <xref ref-type="bibr" rid="B23">Gromiha et al., 2013</xref>), hydrogen bonding (<xref ref-type="bibr" rid="B4">Bleicher et al., 2011</xref>), residues and inter-residue contacts (<xref ref-type="bibr" rid="B21">Gromiha, 2001</xref>; <xref ref-type="bibr" rid="B48">Meruelo et al., 2012</xref>), helical polar surfaces (<xref ref-type="bibr" rid="B30">Jayaraman et al., 2006</xref>), side-chain interactions (<xref ref-type="bibr" rid="B33">Kumar et al., 2000</xref>), and salt bridges (<xref ref-type="bibr" rid="B52">Sadeghi et al., 2006</xref>; <xref ref-type="bibr" rid="B20">Ge et al., 2008</xref>).</p>
<p>Based on these characteristics, some computational models have been developed to predict thermophilic proteins (<xref ref-type="bibr" rid="B65">Wang et al., 2020</xref>). <xref ref-type="bibr" rid="B24">Gromiha and Suresh (2008)</xref> developed a neural network-based model. They reported 89 and 91% accuracy using 5-fold cross-validation and an independent dataset, respectively. <xref ref-type="bibr" rid="B37">Lin and Chen (2011)</xref> built the most reliable benchmark dataset at that time, including 915 thermophilic proteins and 793 non-thermophilic proteins. Using amino acid composition (AAC) and dipeptide composition as inputs of a support vector machine (SVM), the accuracy for thermophilic proteins and non-thermophilic proteins was 93.8 and 92.7%, respectively. Then, the genetic algorithm combined with SVM was applied to the prediction problem (<xref ref-type="bibr" rid="B63">Wang et al., 2011</xref>; <xref ref-type="bibr" rid="B44">Lv et al., 2020c</xref>). <xref ref-type="bibr" rid="B50">Nakariyakul et al. (2012)</xref> established a computational model on the same dataset constructed by <xref ref-type="bibr" rid="B37">Lin and Chen (2011)</xref>. Their model achieved an accuracy of 93.3% in jackknife cross-validation. In recent years, by combining AAC, evolutionary information, and acid dissociation constant, <xref ref-type="bibr" rid="B17">Fan et al. (2016)</xref> built a prediction model with an accuracy of 93.5%. <xref ref-type="bibr" rid="B58">Tang et al. (2017)</xref> proposed a two-step discrimination method using the same dataset and achieved an accuracy of 94.44% in 5-fold cross-validation. A voting algorithm for thermophilic protein prediction has achieved an accuracy of 93.03% (<xref ref-type="bibr" rid="B36">Li J. et al., 2019</xref>). <xref ref-type="bibr" rid="B18">Feng et al. (2020)</xref> developed a reduced AAC-based model and obtained an accuracy of 98.2%. <xref ref-type="bibr" rid="B25">Guo et al. (2020)</xref> used the feature dimension reduction technique to identify thermophilic proteins and reported an accuracy of 96.02%.</p>
<p>Although much work has been done to predict thermophilic proteins, the availability of a reliable benchmark dataset, the development of an accurate model based on multi-feature fusion, and the construction of a software package still need to be further improved. Therefore, this study constructed the most reliable benchmark dataset. Subsequently, an accurate model was developed based on this dataset. Based on the model, a software package was established. The following sections will introduce these processes in detail.</p>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and Methods</title>
<p>The fundamental framework of the present research work includes the following six steps: (1) benchmark dataset construction, (2) feature extraction, (3) feature selection, (4) feature fusion, (5) model training, and (6) software package establishment. The flow chart of the framework is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>Flow chart of a framework for predicting thermophilic proteins.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-13-790063-g001.tif"/>
</fig>
<sec id="S2.SS1">
<title>Dataset</title>
<p>The cornerstone of a robust and reliable model is to generate a reliable and strict benchmark dataset. In previous literature, scholars used 50&#x00B0;C as a cutoff to construct a benchmark dataset. However, this criterion did not seem objective because proteins might be stable even above the OGT of microorganisms. For instance, a protein produced by microorganisms living at 45&#x00B0;C is likely not to denature at 60&#x00B0;C. According to the 50&#x00B0;C cutoff criterion, this protein is included in the negative dataset, but it should be included in the positive dataset as it is still stable above 50&#x00B0;C. To eradicate this effect as much as possible, we used <xref ref-type="bibr" rid="B37">Lin and Chen&#x2019;s (2011)</xref> strict and objective standard to generate a benchmark dataset. According to <xref ref-type="bibr" rid="B37">Lin and Chen&#x2019;s (2011)</xref> criterion, the proteins in the microorganism with OGT &#x003E; 60&#x00B0;C and &#x003C;30&#x00B0;C were regarded as thermophilic and non-thermophilic proteins, respectively. Of course, even after using <xref ref-type="bibr" rid="B37">Lin and Chen&#x2019;s (2011)</xref> criterion, the effect mentioned still exists but not as strongly as when compared to the 50&#x00B0;C cutoff criterion. All protein sequences were extracted from a universal protein resource (UniProt). Subsequently, the following steps were used to ensure the quality of protein data: (I) only proteins that had been manually reviewed were retained; (II) proteins containing ambiguous residues were excluded; (III) sequences that are fragments of other proteins were excluded; (IV) proteins whose existence was inferred from prediction or homology were excluded; (V) to remove redundancy and homology bias, CD-HIT program (<xref ref-type="bibr" rid="B28">Huang et al., 2010</xref>) was used by setting a cutoff of sequence identity to 30%. As a result, the final benchmark dataset contained 1,443 non-thermophilic and 1,368 thermophilic proteins. 
Our final dataset contains only a few thousand proteins because the growth temperature of only some microorganisms is known (<xref ref-type="bibr" rid="B34">Li G. et al., 2019</xref>) and UniProt contains few confirmed proteins. We only included experimental data. Moreover, noise and redundancy were removed, which also caused a reduction in the number of proteins. To train the model, the dataset was split in an 80:20 ratio: the model was trained on 80% of the dataset and validated on the remaining 20%.</p>
</sec>
<sec id="S2.SS2">
<title>Feature Extraction</title>
<p>Protein sequences were transformed into numerical vectors to identify thermophilic proteins by machine learning methods (<xref ref-type="bibr" rid="B38">Liu et al., 2019</xref>, <xref ref-type="bibr" rid="B39">2020</xref>; <xref ref-type="bibr" rid="B35">Li et al., 2021</xref>; <xref ref-type="bibr" rid="B67">Zhang et al., 2021a</xref>,<xref ref-type="bibr" rid="B66">b</xref>). To accomplish this task, we used the iFeature program (<xref ref-type="bibr" rid="B9">Chen et al., 2018</xref>) to generate seven kinds of protein features, namely amino acid composition (AAC), traditional pseudo amino acid composition (tPseAAC), amphiphilic pseudo amino acid composition (aPseAAC), the composition of <italic>k</italic>-spaced amino acid pairs (CKSAAP), dipeptide composition (DC), dipeptide deviation from the expected mean (DDE), and composition, transition, and distribution (CTD). These features will be described in detail in the following sections.</p>
<sec id="S2.SS2.SSS1">
<title>Amino Acid Composition</title>
<p>Amino acid composition (<xref ref-type="bibr" rid="B3">Bhasin and Raghava, 2004</xref>; <xref ref-type="bibr" rid="B46">Lv Z. et al., 2021</xref>) refers to the occurrence frequencies of 20 amino acid residues in a protein sequence and is defined as:</p>
<disp-formula id="S2.E1">
<label>(1)</label>
<mml:math id="M1">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo rspace="5.8pt" stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo rspace="7.5pt">,</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>f</italic>(<italic>t</italic>) represents the frequency of amino acid <italic>t</italic>, and <italic>N</italic>(<italic>t</italic>) indicates the number of occurrences of amino acid <italic>t</italic> in a protein sequence of length <italic>N</italic>.</p>
</sec>
<sec id="S2.SS2.SSS2">
<title>Traditional Pseudo Amino Acid Composition</title>
<p>Traditional pseudo amino acid composition was used to describe residues correlation based on their physicochemical properties (<xref ref-type="bibr" rid="B11">Chou, 2001</xref>). The descriptor uses the 20+&#x03BB; dimensional vectors to represent the protein sequence. The 20 and &#x03BB; dimensions denote the amino acid composition and sequence correlation factor, respectively.</p>
<p>For any protein <italic>P</italic>, its tPseAAC can be represented as:</p>
<disp-formula id="S2.E2">
<label>(2)</label>
<mml:math id="M2">
<mml:mrow>
<mml:mpadded width="+3.3pt">
<mml:mi>P</mml:mi>
</mml:mpadded>
<mml:mo rspace="5.8pt">=</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">3</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mi>&#x03BB;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where the 20+&#x03BB; dimension elements can be formulated as:</p>
<disp-formula id="S2.E3">
<label>(3)</label>
<mml:math id="M3">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mtable displaystyle="true" rowspacing="0pt">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:mfrac>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>u</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>&#x03BC;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>&#x03BC;</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>&#x03C9;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>&#x03BB;</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi>&#x03C4;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mstyle>
<mml:mo>,</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:mfrac>
<mml:mrow>
<mml:mi>&#x03C9;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mi>&#x03C4;</mml:mi>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mo>-</mml:mo>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>&#x03BC;</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>&#x03BC;</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>&#x03C9;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="false">
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>&#x03BB;</mml:mi>
</mml:msubsup>
</mml:mstyle>
<mml:msub>
<mml:mi>&#x03C4;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mstyle>
<mml:mo>,</mml:mo>
<mml:mn mathvariant="italic">21</mml:mn>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mi>&#x03BB;</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
<mml:mi/>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>p</italic><sub><italic>u</italic></sub> and <italic>&#x03C9;</italic> denote the <italic>u</italic>-th element of the feature vector and the weight factor, respectively. Here, we set <italic>&#x03C9;</italic> to 0.05 for saving computational time. The <italic>f</italic><sub><italic>u</italic></sub> shows the amino acid occurrence frequency in a protein <italic>P</italic>. <italic>&#x03C4;</italic><sub><italic>k</italic></sub> represents the <italic>k</italic>-tier sequence correlation factor, which is given by the following formula:</p>
<disp-formula id="S2.E4">
<label>(4)</label>
<mml:math id="M4">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03C4;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn mathvariant="italic">1</mml:mn>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>&lt;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex1">
<label>(5)</label>
<mml:math id="M5">
<mml:mrow>
<mml:msub>
<mml:mi>J</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn mathvariant="italic">1</mml:mn>
<mml:mn mathvariant="italic">3</mml:mn>
</mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mi>M</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>H</italic><sub>1</sub>(<italic>R</italic><sub><italic>i</italic></sub>) is the hydrophobicity value, <italic>H</italic><sub>2</sub>(<italic>R</italic><sub><italic>i</italic></sub>) is the hydrophilicity value, and <italic>M</italic> (<italic>R</italic><sub><italic>i</italic></sub>) is the side chain mass of the amino acid residue <italic>R</italic><sub><italic>i</italic></sub>. For detailed descriptions about tPseAAC, please refer to the literature (<xref ref-type="bibr" rid="B11">Chou, 2001</xref>).</p>
</sec>
<sec id="S2.SS2.SSS3">
<title>Amphiphilic Pseudo Amino Acid Composition</title>
<p>This descriptor incorporates a partial sequence-order effect to the amino acids based on hydrophobicity and hydrophilicity (<xref ref-type="bibr" rid="B10">Chou, 2005</xref>). According to aPseAAC, a protein is represented as follows:</p>
<disp-formula id="S2.E6">
<label>(6)</label>
<mml:math id="M6">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">3</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mn mathvariant="italic">20</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mi>&#x03BB;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mn mathvariant="italic">20</mml:mn>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mn mathvariant="italic">2</mml:mn>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>&#x03BB;</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where the first 20-dimension elements represent the AAC, and the remaining dimensions represent the sequence correlation factor similar to tPseAAC. For further details about aPseAAC, please refer to the literature (<xref ref-type="bibr" rid="B10">Chou, 2005</xref>).</p>
</sec>
<sec id="S2.SS2.SSS4">
<title>Composition of <italic>k</italic>-Spaced Amino Acid Pairs</title>
<p>The CKSAAP describes the frequencies of amino acid pairs separated by any <italic>k</italic> residues. The value of <italic>k</italic> may vary from 0 to 5 (<xref ref-type="bibr" rid="B7">Chen et al., 2007</xref>). CKSAAP for <italic>k</italic> = 0 was formulated as:</p>
<disp-formula id="S2.E7">
<label>(7)</label>
<mml:math id="M7">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn mathvariant="italic">0</mml:mn>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi mathvariant="italic">AA</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mn mathvariant="italic">0</mml:mn>
</mml:msub>
</mml:mfrac>
<mml:mo rspace="4.2pt">,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi mathvariant="italic">AC</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mn mathvariant="italic">0</mml:mn>
</mml:msub>
</mml:mfrac>
<mml:mo rspace="4.2pt">,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi mathvariant="italic">AD</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mn mathvariant="italic">0</mml:mn>
</mml:msub>
</mml:mfrac>
<mml:mo rspace="4.2pt">,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo rspace="4.2pt">,</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi mathvariant="italic">YY</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mn mathvariant="italic">0</mml:mn>
</mml:msub>
</mml:mfrac>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn mathvariant="italic">400</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</disp-formula>
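<p>where <italic>F</italic><sub>0</sub> represents the CKSAAP for <italic>k</italic> = 0, each <italic>F</italic> with a two-letter subscript (e.g., <italic>F</italic><sub><italic>AA</italic></sub>) represents the number of occurrences of the corresponding zero-spaced amino acid pair, <italic>N</italic><sub>0</sub> represents the total number of zero-spaced amino acid pairs in the sequence, and the subscript 400 indicates the number of possible amino acid pairs (20 &#x00D7; 20).</p>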
</sec>
<sec id="S2.SS2.SSS5">
<title>Dipeptide Composition</title>
<p>Dipeptide composition is the frequencies of dipeptides in a protein sequence and is defined as:</p>
<disp-formula id="S2.E8">
<label>(8)</label>
<mml:math id="M8">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>-</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>D</italic><sub><italic>c</italic></sub>(<italic>g</italic>,<italic>h</italic>) denotes the frequency of dipeptide (<italic>g</italic>,<italic>h</italic>), <italic>N</italic>(<italic>g</italic>,<italic>h</italic>) denotes the number of times dipeptide (<italic>g</italic>,<italic>h</italic>) occurs in the protein sequence, and <italic>N</italic> is the sequence length, so that the sequence contains <italic>N</italic> &#x2212; 1 dipeptides in total (<xref ref-type="bibr" rid="B54">Saravanan and Gautham, 2015</xref>).</p>
</sec>
<sec id="S2.SS2.SSS6">
<title>Dipeptide Deviation From Expected Means</title>
<p>Dipeptide deviation from expected means (DDE), proposed by <xref ref-type="bibr" rid="B54">Saravanan and Gautham (2015)</xref>, combines the dipeptide composition (DC), the theoretical mean (<italic>T</italic><sub><italic>m</italic></sub>), and the theoretical variance (<italic>T</italic><sub><italic>v</italic></sub>), and is defined as:</p>
<disp-formula id="S2.E9">
<label>(9)</label>
<mml:math id="M9">
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">DDE</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where,</p>
<disp-formula id="S2.E10">
<label>(10)</label>
<mml:math id="M10">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mfrac>
<mml:mo>&#x00D7;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
</mml:mfrac>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>C</italic><sub><italic>g</italic></sub> indicates the number of codons that code for amino acid <italic>g</italic>, and <italic>C</italic><sub><italic>h</italic></sub> indicates the number of codons that code for amino acid <italic>h</italic>. <italic>C</italic><sub><italic>N</italic></sub> is the total number of codons excluding the stop codons.</p>
<p>The theoretical variance <italic>T</italic><sub><italic>v</italic></sub> is defined as:</p>
<disp-formula id="S2.E11">
<label>(11)</label>
<mml:math id="M11">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>v</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo rspace="5.3pt" stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn mathvariant="italic">1</mml:mn>
<mml:mo>-</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>-</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>N</italic> denotes the length of the sequence.</p>
</sec>
<sec id="S2.SS2.SSS7">
<title>Composition, Transition, and Distribution</title>
<p>According to the characteristics of amino acids, the 20 amino acids can be categorized as polar, neutral, and hydrophobic. According to the definition of CTD, composition (C) is the percent occurrence of polar, neutral, and hydrophobic residues; transition (T) indicates the frequency of transitions between residues of different groups; and distribution (D) describes the positions of the first, 25%, 50%, 75%, and 100% residues of each group. The composition is calculated as:</p>
<disp-formula id="S2.E12">
<label>(12)</label>
<mml:math id="M12">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo rspace="4.2pt">&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>N</italic>(<italic>r</italic>) and <italic>N</italic> indicate the number of amino acids of type <italic>r</italic> and sequence length, respectively (<xref ref-type="bibr" rid="B61">Tomii and Kanehisa, 1996</xref>; <xref ref-type="bibr" rid="B16">Dubchak et al., 1999</xref>).</p>
</sec>
</sec>
<sec id="S2.SS3">
<title>Feature Selection</title>
<p>Redundant features and noise affect the prediction performance of the model. In order to get the best prediction performance, it is necessary to remove redundant features and noise using feature selection methods (<xref ref-type="bibr" rid="B59">Tang et al., 2020</xref>; <xref ref-type="bibr" rid="B70">Zhang Z. M. et al., 2020</xref>; <xref ref-type="bibr" rid="B14">Dao et al., 2021c</xref>). In this study, the analysis of variance (ANOVA; <xref ref-type="bibr" rid="B57">Tang et al., 2018</xref>) was applied for feature ranking, and a sequential backward selection strategy was used to pick out optimal features. The following section will introduce the method briefly.</p>
<p>Analysis of variance (ANOVA) can be used to select the best feature subsets based on <italic>F</italic>-value. <italic>F</italic>-value is the ratio of the variance between the sample types and the variance within the samples. A feature&#x2019;s greater <italic>F</italic>-value implies that the feature can contribute more to discriminating between positive and negative samples.</p>
<p><italic>F</italic>-value for a feature <italic>m</italic> can be calculated as:</p>
<disp-formula id="S2.E13">
<label>(13)</label>
<mml:math id="M13">
<mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mmultiscripts>
<mml:mi>s</mml:mi>
<mml:none/>
<mml:mn>2</mml:mn>
<mml:mi>b</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mmultiscripts>
<mml:mi>s</mml:mi>
<mml:none/>
<mml:mn>2</mml:mn>
<mml:mi>w</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>s<sup>2</sup><sub><italic>b</italic></sub></italic> is the variance between the sample groups and <italic>s<sup>2</sup><sub><italic>w</italic></sub></italic> is the variance within the sample groups. These variances can be represented as:</p>
<disp-formula id="S2.E14">
<label>(14)</label>
<mml:math id="M14">
<mml:mrow>
<mml:mrow>
<mml:mmultiscripts>
<mml:mi>s</mml:mi>
<mml:none/>
<mml:mn mathvariant="italic">2</mml:mn>
<mml:mi>b</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfrac>
<mml:mo>-</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">/</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.E15">
<label>(15)</label>
<mml:math id="M15">
<mml:mrow>
<mml:mrow>
<mml:mmultiscripts>
<mml:mi>s</mml:mi>
<mml:none/>
<mml:mn mathvariant="italic">2</mml:mn>
<mml:mi>w</mml:mi>
<mml:none/>
</mml:mmultiscripts>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:munderover>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mo largeop="true" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>K</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>&#x2062;</mml:mo>
<mml:mi mathvariant="normal">/</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>K</italic> denotes the number of groups (sample types), <italic>N</italic> denotes the total number of samples, <italic>f</italic><sub><italic>ij</italic></sub>(<italic>m</italic>) denotes the value of the <italic>m</italic>-th feature of the <italic>j</italic>-th sample in the <italic>i</italic>-th group, and <italic>n</italic><sub><italic>i</italic></sub> denotes the number of samples in the <italic>i</italic>-th group. The degrees of freedom between groups, <italic>df</italic><sub><italic>b</italic></sub>, and within groups, <italic>df</italic><sub><italic>w</italic></sub>, were <italic>K</italic> &#x2212; 1 and <italic>N</italic> &#x2212; <italic>K</italic>, respectively. For a detailed description of ANOVA, refer to the reference (<xref ref-type="bibr" rid="B57">Tang et al., 2018</xref>).</p>
</sec>
<sec id="S2.SS4">
<title>Classification</title>
<p>For classification, we examined a number of classifiers, including Support Vector Machine (SVM; <xref ref-type="bibr" rid="B58">Tang et al., 2017</xref>), K Nearest Neighbor (KNN; <xref ref-type="bibr" rid="B76">Zuo et al., 2013</xref>; <xref ref-type="bibr" rid="B74">Zulfiqar et al., 2021a</xref>), Random Forest (RF), and Multi-layer Perceptron (MLP) for training the model. The following sections will introduce these classifiers briefly.</p>
<sec id="S2.SS4.SSS1">
<title>Support Vector Machine</title>
<p>Support vector machine maps the features into a multi-dimensional space and defines the optimal hyperplane to separate the two classes using a kernel function. Different kernel functions can be used in SVM. Because of the non-linearity of the data, we used the radial basis function (RBF), which for vectors <italic>a</italic> and <italic>b</italic> is given by:</p>
<disp-formula id="S2.E16">
<label>(16)</label>
<mml:math id="M16">
<mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:mi mathvariant="italic">exp</mml:mi>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>-</mml:mo>
<mml:mrow>
<mml:mi>&#x03B3;</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo fence="true">||</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>-</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo fence="true">||</mml:mo>
</mml:mrow>
<mml:mn mathvariant="italic">2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where &#x03B3; denotes the kernel parameter. The tradeoff between a lower training error and a larger margin is controlled by the regularization factor <italic>C</italic>. In the present study, the values of &#x03B3; and <italic>C</italic> were set to 0.0001 and 900, respectively. For further details about SVM, see <xref ref-type="bibr" rid="B31">Joachims (1998)</xref>.</p>
</sec>
<sec id="S2.SS4.SSS2">
<title>Random Forest</title>
<p>Random forest is based on ensemble methodology to predict the final results. It involves various decision trees, each containing a decision node, leaf node, and root node. A leaf node is the output of each decision tree. The final output is based on the majority voting system (<xref ref-type="bibr" rid="B42">Lv et al., 2020a</xref>). If we have attributes &#x0398; of a vector <italic>x</italic> and decision tree based on these attributes is <italic>h</italic>(<italic>x</italic>, &#x0398;), then the random forest can be defined as:</p>
<disp-formula id="S2.E17">
<label>(17)</label>
<mml:math id="M17">
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">&#x0398;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mn mathvariant="italic">1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn mathvariant="italic">2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>K</italic> is the number of decision trees in the forest. In the present study, the hyperparameters maximum depth, minimum sample split, and n_estimators were set to 100, 10, and 500, respectively. For a detailed algorithm of random forest, refer to the reference (<xref ref-type="bibr" rid="B5">Breiman, 2001</xref>).</p>
</sec>
<sec id="S2.SS4.SSS3">
<title>K Nearest Neighbors</title>
<p>K nearest neighbor is one of the most commonly used classifiers. It represents the feature vectors as points in the feature space and calculates the distances between these points. The final decision is made based on these distances. KNN commonly uses the Euclidean distance as the distance metric.</p>
<p>The Euclidean distance is given below:</p>
<disp-formula id="S2.E18">
<label>(18)</label>
<mml:math id="M18">
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">dist</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>M</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:munderover>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2062;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>-</mml:mo>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:math>
</disp-formula>
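<p>where <italic>M</italic> and <italic>N</italic> are two feature vectors with components <italic>m</italic><sub><italic>i</italic></sub> and <italic>n</italic><sub><italic>i</italic></sub>, respectively, and the upper limit <italic>n</italic> of the summation is the dimensionality of the feature space (<xref ref-type="bibr" rid="B62">Uddin et al., 2019</xref>). The present study applied the KNN classifier with the hyperparameters n-neighbor, <italic>P</italic>, and leaf-size set to 6, 1, and 2, respectively.</p>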
</sec>
<sec id="S2.SS4.SSS4">
<title>Multi-Layer Perceptron</title>
<p>Deep learning is also a popular method in bioinformatics (<xref ref-type="bibr" rid="B12">Dao et al., 2021a</xref>,<xref ref-type="bibr" rid="B13">b</xref>; <xref ref-type="bibr" rid="B41">Lv H. et al., 2021</xref>; <xref ref-type="bibr" rid="B64">Wang et al., 2021</xref>; <xref ref-type="bibr" rid="B73">Zulfiqar et al., 2022</xref>). MLP is a feed-forward neural network containing input, hidden, and output layers for receiving input data, processing data, and performing final prediction, respectively. It trains the network using a supervised learning technique known as backpropagation. The following equation describes the output result of each trained neuron.</p>
<disp-formula id="S2.E19">
<label>(19)</label>
<mml:math id="M19">
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mi>&#x03B1;</mml:mi>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:munder>
<mml:mo largeop="true" movablelimits="false" symmetric="true">&#x2211;</mml:mo>
<mml:mrow>
<mml:mpadded width="+2.8pt">
<mml:mi>i</mml:mi>
</mml:mpadded>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="italic">1</mml:mn>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mpadded width="+5pt">
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mpadded>
<mml:mo>&#x2062;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>x</italic><sub><italic>i</italic></sub> indicates the input values of the firing neuron, <italic>w</italic><sub><italic>i</italic></sub> are their weights, <italic>f</italic> represents the activation function, and <italic>b</italic> represents the activation threshold of the neuron. For a detailed MLP algorithm, refer to the reference (<xref ref-type="bibr" rid="B60">Taud and Mas, 2018</xref>). In the present study, the rectified linear unit (ReLU) was used as the activation function in the hidden layer, and a sigmoid was used as the activation function of the output layer. Input, hidden, and output layers containing 83, 100, and 1 neurons, respectively, were used to train the model. The details of the hyperparameters are presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Best hyperparameters for MLP classifier.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left">Hyperparameters</td>
<td valign="top" align="center">Value</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Batch size</td>
<td valign="top" align="center">60</td>
</tr>
<tr>
<td valign="top" align="left">Epochs</td>
<td valign="top" align="center">1200</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="center">0.001</td>
</tr>
<tr>
<td valign="top" align="left">Momentum</td>
<td valign="top" align="center">0.8</td>
</tr>
<tr>
<td valign="top" align="left">Decay</td>
<td valign="top" align="center">1 &#x00D7; 10<sup>&#x2212;8</sup></td>
</tr>
<tr>
<td valign="top" align="left">Nesterov</td>
<td valign="top" align="center">True</td>
</tr>
<tr>
<td valign="top" align="left">Verbose</td>
<td valign="top" align="center">1</td>
</tr>
</tbody>
</table></table-wrap>
</sec>
</sec>
<sec id="S2.SS5">
<title>Performance Evaluation</title>
<p>In order to evaluate the overall model performance, the following parameters were used (<xref ref-type="bibr" rid="B43">Lv et al., 2020b</xref>,<xref ref-type="bibr" rid="B45">d</xref>; <xref ref-type="bibr" rid="B55">Shao et al., 2021</xref>).</p>
<disp-formula id="S2.E20">
<label>(20)</label>
<mml:math id="M20">
<mml:mrow>
<mml:mi mathvariant="italic">Sn</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.E21">
<label>(21)</label>
<mml:math id="M21">
<mml:mrow>
<mml:mi mathvariant="italic">Sp</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi mathvariant="italic">TN</mml:mi>
<mml:mrow>
<mml:mi mathvariant="italic">TN</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FP</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.E22">
<label>(22)</label>
<mml:math id="M22">
<mml:mrow>
<mml:mi mathvariant="italic">Acc</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">TN</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">TN</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">FN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex2">
<label>(23)</label>
<mml:math id="M23">
<mml:mrow>
<mml:mi mathvariant="italic">MCC</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">TP</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi mathvariant="italic">TN</mml:mi>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mrow>
<mml:mi mathvariant="italic">FP</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi mathvariant="italic">FN</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo>+</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>Sn, Sp, Acc</italic>, and <italic>MCC</italic> denote sensitivity, specificity, accuracy, and Matthews correlation coefficient, respectively. Thermophilic proteins classified as thermophilic were denoted by <italic>TP</italic> (true positive), non-thermophilic proteins classified as non-thermophilic were denoted by <italic>TN</italic> (true negative), non-thermophilic proteins classified as thermophilic were denoted by <italic>FP</italic> (false positive), and thermophilic proteins classified as non-thermophilic were denoted by <italic>FN</italic> (false negative).</p>
</sec>
</sec>
<sec id="S3" sec-type="results|discussion">
<title>Results and Discussion</title>
<sec id="S3.SS1">
<title>Performance Evaluation</title>
<p>For performance evaluation, seven descriptors including AAC, tPseAAC, aPseAAC, DC, DDE, CKSAAP, and CTD were used to create numerical vectors from protein sequences. In order to use these numerical vectors, MLP-based models were trained to evaluate their performances. Results showed that the AUC values were 0.9723, 0.9551, 0.9519, 0.8812, 0.9081, 0.7365, and 0.9786 for AAC, tPseAAC, aPseAAC, DC, DDE, CKSAAP, and CTD, respectively (as shown in <xref ref-type="table" rid="T2">Table 2</xref>). In order to remove the redundant features and improve the prediction performance of the model, a feature selection method should be used to pick out the optimal features from each descriptor. In this work, ANOVA was used to rank features for selecting the best feature subsets from the seven types of descriptors. <xref ref-type="table" rid="T2">Table 2</xref> also recorded the performance of each descriptor after feature selection. It showed that AAC, tPseAAC, aPseAAC, DC, DDE, CKSAAP, and CTD produced the best AUC of 0.9735, 0.9580, 0.9610, 0.9143, 0.9165, 0.8349, and 0.9644, respectively. Obviously, the performance of each descriptor increased after the feature selection except the CTD descriptor; therefore, we considered all features of CTD in our study.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>Performance of descriptors before and after feature selection and in feature fusion.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"></td>
<td valign="top" align="center">Descriptors</td>
<td valign="top" align="center">SN</td>
<td valign="top" align="center">SP</td>
<td valign="top" align="center">ACC</td>
<td valign="top" align="center">MCC</td>
<td valign="top" align="center">AUC</td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Before feature selection</td>
<td valign="top" align="center">AAC</td>
<td valign="top" align="center">0.9304</td>
<td valign="top" align="center">0.9308</td>
<td valign="top" align="center">0.9306</td>
<td valign="top" align="center">0.8626</td>
<td valign="top" align="center">0.9723</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">tPseAAC</td>
<td valign="top" align="center">0.9011</td>
<td valign="top" align="center">0.8793</td>
<td valign="top" align="center">0.8899</td>
<td valign="top" align="center">0.7914</td>
<td valign="top" align="center">0.9551</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">aPseAAC</td>
<td valign="top" align="center">0.8901</td>
<td valign="top" align="center">0.8720</td>
<td valign="top" align="center">0.8808</td>
<td valign="top" align="center">0.7714</td>
<td valign="top" align="center">0.9519</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">DC</td>
<td valign="top" align="center">0.7546</td>
<td valign="top" align="center">0.8720</td>
<td valign="top" align="center">0.8149</td>
<td valign="top" align="center">0.5963</td>
<td valign="top" align="center">0.8812</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">DDE</td>
<td valign="top" align="center">0.8022</td>
<td valign="top" align="center">0.8374</td>
<td valign="top" align="center">0.8203</td>
<td valign="top" align="center">0.6319</td>
<td valign="top" align="center">0.9081</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">CKSAAP</td>
<td valign="top" align="center">0.7912</td>
<td valign="top" align="center">0.5398</td>
<td valign="top" align="center">0.6619</td>
<td valign="top" align="center">0.3855</td>
<td valign="top" align="center">0.7365</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">CTD</td>
<td valign="top" align="center">0.9377</td>
<td valign="top" align="center">0.9100</td>
<td valign="top" align="center">0.9235</td>
<td valign="top" align="center">0.8612</td>
<td valign="top" align="center">0.9786</td>
</tr>
<tr>
<td valign="top" align="left">After feature selection</td>
<td valign="top" align="center">AAC</td>
<td valign="top" align="center">0.9524</td>
<td valign="top" align="center">0.9239</td>
<td valign="top" align="center">0.9377</td>
<td valign="top" align="center">0.8902</td>
<td valign="top" align="center">0.9735</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">tPseAAC</td>
<td valign="top" align="center">0.8938</td>
<td valign="top" align="center">0.8962</td>
<td valign="top" align="center">0.8950</td>
<td valign="top" align="center">0.7943</td>
<td valign="top" align="center">0.9580</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">aPseAAC</td>
<td valign="top" align="center">0.8971</td>
<td valign="top" align="center">0.8824</td>
<td valign="top" align="center">0.8895</td>
<td valign="top" align="center">0.7863</td>
<td valign="top" align="center">0.9610</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">DC</td>
<td valign="top" align="center">0.8059</td>
<td valign="top" align="center">0.8754</td>
<td valign="top" align="center">0.8416</td>
<td valign="top" align="center">0.6620</td>
<td valign="top" align="center">0.9143</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">DDE</td>
<td valign="top" align="center">0.7802</td>
<td valign="top" align="center">0.8651</td>
<td valign="top" align="center">0.8238</td>
<td valign="top" align="center">0.6430</td>
<td valign="top" align="center">0.9165</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">CKSAAP</td>
<td valign="top" align="center">0.7070</td>
<td valign="top" align="center">0.8374</td>
<td valign="top" align="center">0.7740</td>
<td valign="top" align="center">0.5156</td>
<td valign="top" align="center">0.8349</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">CTD</td>
<td valign="top" align="center">0.9167</td>
<td valign="top" align="center">0.9135</td>
<td valign="top" align="center">0.9150</td>
<td valign="top" align="center">0.8330</td>
<td valign="top" align="center">0.9644</td>
</tr>
<tr>
<td valign="top" align="left"/><td valign="top" align="center">Feature fusion</td>
<td valign="top" align="center">0.9634</td>
<td valign="top" align="center">0.9619</td>
<td valign="top" align="center">0.9626</td>
<td valign="top" align="center">0.9269</td>
<td valign="top" align="center">0.9864</td>
</tr>
</tbody>
</table></table-wrap>
<p>The above results and analysis have demonstrated that each descriptor has useful information to discriminate thermophilic proteins from non-thermophilic proteins. We adopted a feature fusion strategy to include the valuable information of all selected features from each descriptor in model training. In feature fusion, the selected optimal feature subsets of seven descriptors were fused and inputted into the MLP classifier to distinguish thermophilic proteins from non-thermophilic proteins. <xref ref-type="table" rid="T2">Table 2</xref> shows that the AUC increased to 0.9864, suggesting that feature fusion is very effective and has made an outstanding contribution to improving the model&#x2019;s prediction performance.</p>
</sec>
<sec id="S3.SS2">
<title>Performance Comparison on Different Algorithms</title>
<p>In order to demonstrate that the MLP classifier has better prediction performance, we also investigated the performance of other machine learning methods, including SVM, random forest, and KNN. These methods were trained and tested using the same fused features. The results are recorded in <xref ref-type="fig" rid="F2">Figure 2</xref>. As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, the performance of the MLP classifier was better than that of the other classifiers. Therefore, we considered using an MLP-based model to establish a software package.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>Performance comparison of MLP classifier with other classifiers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-13-790063-g002.tif"/>
</fig>
</sec>
<sec id="S3.SS3">
<title>Comparison to Other Models</title>
<p>Many models have been proposed for thermophilic protein identification (<xref ref-type="bibr" rid="B24">Gromiha and Suresh, 2008</xref>; <xref ref-type="bibr" rid="B37">Lin and Chen, 2011</xref>; <xref ref-type="bibr" rid="B63">Wang et al., 2011</xref>; <xref ref-type="bibr" rid="B50">Nakariyakul et al., 2012</xref>; <xref ref-type="bibr" rid="B17">Fan et al., 2016</xref>; <xref ref-type="bibr" rid="B58">Tang et al., 2017</xref>; <xref ref-type="bibr" rid="B36">Li J. et al., 2019</xref>; <xref ref-type="bibr" rid="B18">Feng et al., 2020</xref>; <xref ref-type="bibr" rid="B25">Guo et al., 2020</xref>). All proposed models were established based on machine learning methods and were evaluated by cross-validation. However, our model was examined on independent data. Moreover, the benchmark dataset used in the present study was rigorous and objective. In addition, most of these published works did not provide publicly available tools, which not only limits their practical use but also prevents us from making a fair comparison. The only available web-server for the identification of thermophilic proteins was established by <xref ref-type="bibr" rid="B37">Lin and Chen (2011)</xref>. We performed a comparison with the web server using the same validation dataset. Their model (<xref ref-type="bibr" rid="B37">Lin and Chen, 2011</xref>) displayed 95.30% accuracy, while our model produced an accuracy of 96.26%.</p>
</sec>
<sec id="S3.SS4">
<title>Feature Analysis</title>
<p>Our model produces good prediction performance and shows that the characteristics used can effectively characterize thermophilic proteins. Thus, we performed an analysis on features based on their contribution to model performance. In order to find feature contribution, we used permutation feature importance. The contribution of features to the performance of the model is represented in <xref ref-type="supplementary-material" rid="DS1">Supplementary Table 1</xref>. The following section will analyze the feature of each descriptor briefly.</p>
<p>The composition and arrangement of amino acids determine the unique function of a protein. At present, the research on thermophilic proteins uses the composition characteristics of amino acids. The current study involves a detailed analysis of AAC. We found that the frequencies of alanine (A), lysine (K), valine (V), isoleucine (I), glutamine (Q), aspartic acid (D), tyrosine (Y), serine (S), glutamic acid (E), and threonine (T) were significantly different between the two classes. It is speculated that these amino acids have crucial information in providing either thermophilicity or non-thermophilicity to proteins. Tyrosine contributed the most to model performance among these amino acids and showed the weight 0.0249 &#x00B1; 0.0080. Moreover, lysine, glutamic acid, glutamine, and aspartic acid also contributed well to model performance and showed the weights 0.0033 &#x00B1; 0.0026, 0.0041 &#x00B1; 0.0017, 0.0036 &#x00B1; 0.0049, and 0.0162 &#x00B1; 0.0036, respectively. Glutamine, lysine, tyrosine, glutamic acid, and aspartic acid residues were more common in thermophilic proteins than non-thermophilic proteins. Thermophilic proteins contain highly charged amino acids, which contribute to the thermal stability of proteins. Lysine, glutamine, aspartic acid, and glutamic acid residues belong to charged amino acids, while tyrosine belongs to polar amino acids. These amino acids participate in forming salt bridges and hydrogen bonds, which provide thermal stability to proteins. These results are consistent with previous studies (<xref ref-type="bibr" rid="B40">Liu et al., 2011</xref>; <xref ref-type="bibr" rid="B63">Wang et al., 2011</xref>; <xref ref-type="bibr" rid="B51">Panja et al., 2020</xref>).</p>
<p>Valine and isoleucine showed good ability for thermophilic protein identification. In permutation feature importance, valine and isoleucine showed the weights 0.0201 &#x00B1; 0.0056 and 0.0101 &#x00B1; 0.0028, respectively. Isoleucine and valine are hydrophobic amino acids. It has been reported that hydrophobicity contributes to the thermal stability of proteins, as during protein folding, hydrophobic amino acids get buried inside the protein to form a hydrophobic core; this hydrophobic core contributes to the thermal stability of proteins (<xref ref-type="bibr" rid="B2">Baldwin, 2007</xref>; <xref ref-type="bibr" rid="B24">Gromiha and Suresh, 2008</xref>).</p>
<p>Amino acid alanine, threonine, and serine indicated an important role in model performance and showed the weights 0.0087 &#x00B1; 0.0018, 0.0122 &#x00B1; 0.0045, and 0.0031 &#x00B1; 0.0039, respectively. <xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the contribution of AAC features to model performance. Non-thermophilic proteins contain more alanine, threonine, and serine residues than thermophilic proteins, consistent with a previous study by <xref ref-type="bibr" rid="B6">Cambillau and Claverie (2000)</xref>. Alanine carries less charge, while threonine and serine are neutral amino acids, so these amino acids are rarely involved in forming hydrogen bonds and salt bridges, indicating that the proteins enriched with these amino acids can be prone to thermal denaturation (<xref ref-type="bibr" rid="B37">Lin and Chen, 2011</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Contribution of features of all descriptors to model performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-13-790063-g003.tif"/>
</fig>
<p>Amino acid composition is an excellent descriptor to discriminate thermophilic proteins from non-thermophilic proteins. Previous studies have also confirmed the contribution of AAC to protein classification tasks (<xref ref-type="bibr" rid="B24">Gromiha and Suresh, 2008</xref>; <xref ref-type="bibr" rid="B47">Mahmoudi et al., 2016</xref>). Although AAC plays a good role in protein classification, it also lacks sequence information. The traditional tPseAAC and aPseAAC (<xref ref-type="bibr" rid="B11">Chou, 2001</xref>, <xref ref-type="bibr" rid="B10">2005</xref>) are good options for the lack of sequence information in AAC. <xref ref-type="bibr" rid="B63">Wang et al. (2011)</xref> and <xref ref-type="bibr" rid="B8">Chen et al. (2016)</xref> also confirmed the critical role of these descriptors in protein classification.</p>
<p>Both tPseAAC and aPseAAC are used to describe the sequence information of amino acid residues in protein sequence. In tPseAAC, Xc1.K, Xc1.E, Xc1.D, Xc1.Q, Xc1.T, Xc1.A, Xc1.G, and Xc1.S were valuable features with the weights of 0.0045 &#x00B1; 0.0019, 0.0041 &#x00B1; 0.0017, 0.0040 &#x00B1; 0.0029, 0.0036 &#x00B1; 0.0049, 0.0035 &#x00B1; 0.0015, 0.0030 &#x00B1; 0.0029, 0.0026 &#x00B1; 0.0027, and 0.0018 &#x00B1; 0.0026, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>). The features Pc1.Q, Pc1.E, Pc1.I, Pc1.T, Pc1.A, Pc1.S, Pc1.G, and Pc1.K in aPseAAC presented important contribution to model performance. They showed the weights 0.0086 &#x00B1; 0.0030, 0.0081 &#x00B1; 0.0046, 0.0060 &#x00B1; 0.0057, 0.0047 &#x00B1; 0.0030, 0.0047 &#x00B1; 0.0026, 0.0042 &#x00B1; 0.0030, 0.0034 &#x00B1; 0.0023, and 0.0018 &#x00B1; 0.0014, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>). Our in-depth analysis showed that hydrophobic amino acid and polar amino acid based features were more frequent in thermophilic protein, while uncharged and neutral amino acid based features were more frequent in non-thermophilic proteins.</p>
<p>Dipeptides are also an important feature to distinguish thermophilic proteins from non-thermophilic proteins. Our statistical analysis showed that the occurrence frequencies of KE, LK, EE, EK, AA, LA, KI, IK, KK, and EI have a considerable variance between the two classes of proteins. The ranking of features also confirmed the role of these dipeptides in model performance. Dipeptides KE, LK, EE, EK, AA, LA, KI, IK, KK, and EI showed the weights 0.0113 &#x00B1; 0.0029, 0.0086 &#x00B1; 0.0045, 0.0072 &#x00B1; 0.0019, 0.0070 &#x00B1; 0.0040, 0.0069 &#x00B1; 0.0043, 0.0058 &#x00B1; 0.0010, 0.0043 &#x00B1; 0.0013, 0.0040 &#x00B1; 0.0017, 0.0029 &#x00B1; 0.0026, and 0.0023 &#x00B1; 0.0017, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>). Dipeptides KE, LK, EE, EK, KI, IK, KK, and EI are charged at biological pH, showing a great tendency to form salt bridges and hydrogen bonds, which contributes to the thermal stability of proteins. AA and LA have poor charge capability and were found more in non-thermophilic proteins (<xref ref-type="bibr" rid="B50">Nakariyakul et al., 2012</xref>; <xref ref-type="bibr" rid="B51">Panja et al., 2020</xref>). Previous studies have also confirmed the role of dipeptide composition in identifying thermophilic proteins (<xref ref-type="bibr" rid="B22">Gromiha et al., 2005</xref>; <xref ref-type="bibr" rid="B37">Lin and Chen, 2011</xref>). The MLP model trained on these selected features reveals that these features have good capability to distinguish thermophilic proteins.</p>
<p>The dipeptide deviation from the expected mean also showed meaningful information for the identification of thermophilic proteins. Features including EE, AA, and KE deviation from expected mean showed good ability to identify thermophilic proteins (<xref ref-type="table" rid="T2">Table 2</xref>). The dipeptide deviation for EE, AA, and KE showed the weights 0.0098 &#x00B1; 0.0032, 0.0039 &#x00B1; 0.0028, and 0.0025 &#x00B1; 0.0012, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>). Previous studies have also reported the effective contribution of dipeptide deviating from the expected mean in protein classification tasks (<xref ref-type="bibr" rid="B54">Saravanan and Gautham, 2015</xref>; <xref ref-type="bibr" rid="B26">Ho Thanh Lam et al., 2020</xref>). In addition to these dipeptide-related descriptors, we also considered the composition of <italic>k</italic>-spaced amino acid pairs, representing the paired amino acid frequency separated by any other amino acid. It is a valuable descriptor and has been widely used in previous studies for protein classification (<xref ref-type="bibr" rid="B29">Jang et al., 2020</xref>; <xref ref-type="bibr" rid="B32">Ju and Wang, 2020</xref>; <xref ref-type="bibr" rid="B69">Zhang L. et al., 2020</xref>; <xref ref-type="bibr" rid="B75">Zulfiqar et al., 2021b</xref>). In the present study, E<sup>&#x2217;&#x2217;</sup>K, E<sup>&#x2217;&#x2217;&#x2217;</sup>K, A<sup>&#x2217;&#x2217;</sup>A, and A<sup>&#x2217;</sup>A were found to be containing meaningful information for thermophilic protein identification and showed the weight 0.0077 &#x00B1; 0.0041, 0.0057 &#x00B1; 0.0014, 0.0031 &#x00B1; 0.0034, and 0.0028 &#x00B1; 0.0015, respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<p>Composition, transition, and distribution (CTD) involves the composition, transition, and distribution of hydrophobic, polar, and neutral residues. Like other descriptors, the hydrophobic and polar residue-based features of CTD were more frequent in thermophilic proteins while neutral residue-based features were more frequent in non-thermophilic proteins. Permutation feature importance of descriptor CTD is represented in <xref ref-type="supplementary-material" rid="DS1">Supplementary Table 2</xref>. In previous studies, the CTD has been extensively used for protein classification purposes. <xref ref-type="bibr" rid="B63">Wang et al. (2011)</xref> and <xref ref-type="bibr" rid="B75">Zulfiqar et al. (2021b)</xref> also reported CTD as a valuable descriptor for thermophilic protein identification. In the present study, the CTD showed an excellent capability to identify thermophilic proteins (<xref ref-type="table" rid="T2">Table 2</xref>). For CTD, all features performed better than the selected features, so we used CTD features without selection. The MLP model trained on CTD features produced good results (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
</sec>
</sec>
<sec id="S4">
<title>iThermo</title>
<p>In addition to proposing a validated model, it is essential to establish a tool to promote the application of the model. To meet this requirement, we established an application software package, iThermo (<ext-link ext-link-type="uri" xlink:href="http://lin-group.cn/server/iThermo/index.html">http://lin-group.cn/server/iThermo/index.html</ext-link>). The software package provides easy access to the model and can be used to make efficient and accurate predictions for thermophilic proteins. It is anticipated that this study will provide a good alternative to laborious, expensive, and time-consuming laboratory practices.</p>
</sec>
<sec id="S5" sec-type="conclusion">
<title>Conclusion</title>
<p>Thermophilic proteins can withstand the harsh conditions of elevated temperature. Thermophilic proteins have attracted much attention in biotechnology and industrial applications. High temperature leads to protein denaturation, so it is urgent to establish a reliable identification method of thermophilic proteins. The identification of thermophilic proteins based on biochemistry is time-consuming, laborious, and expensive. The computational method-based thermophilic protein identification can provide an attractive choice for rapid, cost-effective, and straightforward identification of thermophilic proteins.</p>
<p>Considering this urgency, this study constructed a reliable benchmark dataset and used this dataset to train an MLP classifier. The model has good performance on an independent dataset and can accurately identify thermophilic proteins with an accuracy of 96.26%. In order to facilitate access to the model, a software package was also established. The high performance of the model and its availability as flexible packaging can provide a good choice for thermophilic protein study.</p>
</sec>
<sec id="S6" sec-type="data-availability">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="DS1">Supplementary Material</xref>, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="S7">
<title>Author Contributions</title>
<p>LT, X-LY, and Z-YZ conceived and designed the study. ZA conducted the experiments, implemented algorithms, performed the analysis, and wrote the manuscript. ZA, AK, and F-YD established a software package. IG and HZ reviewed and edited the manuscript. LT supervised the study. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="conf1" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="pudiscl1" sec-type="disclaimer">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
</body>
<back>
<sec id="S8" sec-type="funding-information">
<title>Funding</title>
<p>This work was supported by a grant from the National Natural Science Foundation of China (62102067).</p>
</sec>
<sec id="S9" sec-type="supplementary-material">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2022.790063/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmicb.2022.790063/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.docx" id="DS1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alim</surname> <given-names>A.</given-names></name> <name><surname>Rafay</surname> <given-names>A.</given-names></name> <name><surname>Naseem</surname> <given-names>I.</given-names></name></person-group> (<year>2021</year>). <article-title>PoGB-pred: prediction of antifreeze proteins sequences using amino acid composition with feature selection followed by a sequential-based ensemble approach.</article-title> <source><italic>Curr. Bioinform.</italic></source> <volume>16</volume> <fpage>446</fpage>&#x2013;<lpage>456</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615999200707141926</pub-id></citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Baldwin</surname> <given-names>R. L.</given-names></name></person-group> (<year>2007</year>). <article-title>Energetics of protein folding.</article-title> <source><italic>J. Mol. Biol.</italic></source> <volume>371</volume> <fpage>283</fpage>&#x2013;<lpage>301</lpage>.</citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bhasin</surname> <given-names>M.</given-names></name> <name><surname>Raghava</surname> <given-names>G. P.</given-names></name></person-group> (<year>2004</year>). <article-title>Classification of nuclear receptors based on amino acid composition and dipeptide composition.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>279</volume> <fpage>23262</fpage>&#x2013;<lpage>23266</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.M401932200</pub-id> <pub-id pub-id-type="pmid">15039428</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bleicher</surname> <given-names>L.</given-names></name> <name><surname>Prates</surname> <given-names>E. T.</given-names></name> <name><surname>Gomes</surname> <given-names>T. C.</given-names></name> <name><surname>Silveira</surname> <given-names>R. L.</given-names></name> <name><surname>Nascimento</surname> <given-names>A. S.</given-names></name> <name><surname>Rojas</surname> <given-names>A. L.</given-names></name><etal/></person-group> (<year>2011</year>). <article-title>Molecular basis of the thermostability and thermophilicity of laminarinases: X-ray structure of the hyperthermostable laminarinase from <italic>Rhodothermus marinus</italic> and molecular dynamics simulations.</article-title> <source><italic>J. Phys. Chem.</italic></source> <volume>115</volume> <fpage>7940</fpage>&#x2013;<lpage>7949</lpage>. <pub-id pub-id-type="doi">10.1021/jp200330z</pub-id> <pub-id pub-id-type="pmid">21619042</pub-id></citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Breiman</surname> <given-names>L.</given-names></name></person-group> (<year>2001</year>). <article-title>Random forests.</article-title> <source><italic>Mach. Learn.</italic></source> <volume>45</volume> <fpage>5</fpage>&#x2013;<lpage>32</lpage>.</citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cambillau</surname> <given-names>C.</given-names></name> <name><surname>Claverie</surname> <given-names>J. S. M.</given-names></name></person-group> (<year>2000</year>). <article-title>Structural and genomic correlates of hyperthermostability.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>275</volume> <fpage>32383</fpage>&#x2013;<lpage>32386</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.C000497200</pub-id> <pub-id pub-id-type="pmid">10940293</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>K.</given-names></name> <name><surname>Kurgan</surname> <given-names>L. A.</given-names></name> <name><surname>Ruan</surname> <given-names>J.</given-names></name></person-group> (<year>2007</year>). <article-title>Prediction of flexible/rigid regions from protein sequences using k-spaced amino acid pairs.</article-title> <source><italic>BMC Struct. Biol.</italic></source> <volume>7</volume>:<fpage>25</fpage>. <pub-id pub-id-type="doi">10.1186/1472-6807-7-25</pub-id> <pub-id pub-id-type="pmid">17437643</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>X. X.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>W. C.</given-names></name> <name><surname>Wu</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Ding</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>Identification of bacterial cell wall lyases via pseudo amino acid composition.</article-title> <source><italic>Biomed. Res. Int</italic></source> <volume>2016</volume>:<fpage>1654623</fpage>. <pub-id pub-id-type="doi">10.1155/2016/1654623</pub-id> <pub-id pub-id-type="pmid">27437396</pub-id></citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Zhao</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Leier</surname> <given-names>A.</given-names></name> <name><surname>Marquez-Lago</surname> <given-names>T. T.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>iFeature: a python package and web server for features extraction and selection from protein and peptide sequences.</article-title> <source><italic>Bioinformatics</italic></source> <volume>34</volume> <fpage>2499</fpage>&#x2013;<lpage>2502</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty140</pub-id> <pub-id pub-id-type="pmid">29528364</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chou</surname> <given-names>K. C.</given-names></name></person-group> (<year>2005</year>). <article-title>Using amphiphilic pseudo amino acid composition to predict enzyme subfamily classes.</article-title> <source><italic>Bioinformatics</italic></source> <volume>21</volume> <fpage>10</fpage>&#x2013;<lpage>19</lpage>.</citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chou</surname> <given-names>K. C.</given-names></name></person-group> (<year>2001</year>). <article-title>Prediction of protein cellular attributes using pseudo-amino acid composition.</article-title> <source><italic>Proteins</italic></source> <volume>43</volume> <fpage>246</fpage>&#x2013;<lpage>255</lpage>. <pub-id pub-id-type="doi">10.1002/prot.1035</pub-id> <pub-id pub-id-type="pmid">11288174</pub-id></citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Su</surname> <given-names>W.</given-names></name> <name><surname>Sun</surname> <given-names>Z. J.</given-names></name> <name><surname>Huang</surname> <given-names>Q. L.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2021a</year>). <article-title>iDHS-deep: an integrated tool for predicting DNase I hypersensitive sites by deep neural network.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume>:<fpage>bbab047</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab047</pub-id> <pub-id pub-id-type="pmid">33751027</pub-id></citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>Z. M.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2021b</year>). <article-title>DeepYY1: a deep learning approach to identify YY1-mediated chromatin loops.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume>:<fpage>bbaa356</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa356</pub-id> <pub-id pub-id-type="pmid">33279983</pub-id></citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Su</surname> <given-names>W.</given-names></name> <name><surname>Gao</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2021c</year>). <article-title>A computational platform to identify origins of replication sites in eukaryotes.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume> <fpage>1940</fpage>&#x2013;<lpage>1950</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa017</pub-id> <pub-id pub-id-type="pmid">32065211</pub-id></citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>Y.</given-names></name> <name><surname>Cai</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Xu</surname> <given-names>W.</given-names></name></person-group> (<year>2004</year>). <article-title>The influence of dipeptide composition on protein thermostability.</article-title> <source><italic>FEBS Lett.</italic></source> <volume>569</volume> <fpage>284</fpage>&#x2013;<lpage>288</lpage>. <pub-id pub-id-type="doi">10.1016/j.febslet.2004.06.009</pub-id> <pub-id pub-id-type="pmid">15225649</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dubchak</surname> <given-names>I.</given-names></name> <name><surname>Muchnik</surname> <given-names>I.</given-names></name> <name><surname>Mayor</surname> <given-names>C.</given-names></name> <name><surname>Dralyuk</surname> <given-names>I.</given-names></name> <name><surname>Kim</surname> <given-names>S. H.</given-names></name></person-group> (<year>1999</year>). <article-title>Recognition of a protein fold in the context of the SCOP classification.</article-title> <source><italic>Proteins</italic></source> <volume>35</volume> <fpage>401</fpage>&#x2013;<lpage>407</lpage>.</citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>G. L.</given-names></name> <name><surname>Liu</surname> <given-names>Y. L.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name></person-group> (<year>2016</year>). <article-title>Identification of thermophilic proteins by incorporating evolutionary and acid dissociation information into Chou&#x2019;s general pseudo amino acid composition.</article-title> <source><italic>J. Theor. Biol.</italic></source> <volume>407</volume> <fpage>138</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1016/j.jtbi.2016.07.010</pub-id> <pub-id pub-id-type="pmid">27396359</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Feng</surname> <given-names>C.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>D.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>A method for prediction of thermophilic protein based on reduced amino acids and mixed features.</article-title> <source><italic>Front. Bioeng. Biotechnol.</italic></source> <volume>8</volume>:<fpage>285</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.00285</pub-id> <pub-id pub-id-type="pmid">32432088</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fukuchi</surname> <given-names>S.</given-names></name> <name><surname>Nishikawa</surname> <given-names>K.</given-names></name></person-group> (<year>2001</year>). <article-title>Protein surface amino acid compositions distinctively differ between thermophilic and mesophilic bacteria.</article-title> <source><italic>J. Mol. Biol.</italic></source> <volume>309</volume> <fpage>835</fpage>&#x2013;<lpage>843</lpage>. <pub-id pub-id-type="doi">10.1006/jmbi.2001.4718</pub-id> <pub-id pub-id-type="pmid">11399062</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ge</surname> <given-names>M.</given-names></name> <name><surname>Xia</surname> <given-names>X. Y.</given-names></name> <name><surname>Pan</surname> <given-names>X. M.</given-names></name></person-group> (<year>2008</year>). <article-title>Salt bridges in the hyperthermophilic protein Ssh10b are resilient to temperature increases.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>283</volume> <fpage>31690</fpage>&#x2013;<lpage>31696</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.M805750200</pub-id> <pub-id pub-id-type="pmid">18779322</pub-id></citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gromiha</surname> <given-names>M. M.</given-names></name></person-group> (<year>2001</year>). <article-title>Important inter-residue contacts for enhancing the thermal stability of thermophilic proteins.</article-title> <source><italic>Biophys. Chem.</italic></source> <volume>91</volume> <fpage>71</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1016/s0301-4622(01)00154-5</pub-id> <pub-id pub-id-type="pmid">11403885</pub-id></citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gromiha</surname> <given-names>M. M.</given-names></name> <name><surname>Ahmad</surname> <given-names>S.</given-names></name> <name><surname>Suwa</surname> <given-names>M.</given-names></name></person-group> (<year>2005</year>). <article-title>Application of residue distribution along the sequence for discriminating outer membrane proteins.</article-title> <source><italic>Comput. Biol. Chem.</italic></source> <volume>29</volume> <fpage>135</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1016/j.compbiolchem.2005.02.006</pub-id> <pub-id pub-id-type="pmid">15833441</pub-id></citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gromiha</surname> <given-names>M. M.</given-names></name> <name><surname>Pathak</surname> <given-names>M. C.</given-names></name> <name><surname>Saraboji</surname> <given-names>K.</given-names></name> <name><surname>Ortlund</surname> <given-names>E. A.</given-names></name> <name><surname>Gaucher</surname> <given-names>E. A.</given-names></name></person-group> (<year>2013</year>). <article-title>Hydrophobic environment is a key factor for the stability of thermophilic proteins.</article-title> <source><italic>Proteins</italic></source> <volume>81</volume> <fpage>715</fpage>&#x2013;<lpage>721</lpage>. <pub-id pub-id-type="doi">10.1002/prot.24232</pub-id> <pub-id pub-id-type="pmid">23319168</pub-id></citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gromiha</surname> <given-names>M. M.</given-names></name> <name><surname>Suresh</surname> <given-names>M. X.</given-names></name></person-group> (<year>2008</year>). <article-title>Discrimination of mesophilic and thermophilic proteins using machine learning algorithms.</article-title> <source><italic>Proteins</italic></source> <volume>70</volume> <fpage>1274</fpage>&#x2013;<lpage>1279</lpage>. <pub-id pub-id-type="doi">10.1002/prot.21616</pub-id> <pub-id pub-id-type="pmid">17876820</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>Discrimination of thermophilic proteins and non-thermophilic proteins using feature dimension reduction.</article-title> <source><italic>Front. Bioeng. Biotechnol.</italic></source> <volume>8</volume>:<fpage>584807</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.584807</pub-id> <pub-id pub-id-type="pmid">33195148</pub-id></citation></ref>
<ref id="B26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ho Thanh Lam</surname> <given-names>L.</given-names></name> <name><surname>Le</surname> <given-names>N. H.</given-names></name> <name><surname>Van Tuan</surname> <given-names>L.</given-names></name> <name><surname>Tran Ban</surname> <given-names>H.</given-names></name> <name><surname>Nguyen Khanh Hung</surname> <given-names>T.</given-names></name> <name><surname>Nguyen</surname> <given-names>N. T. K.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>Machine learning model for identifying antioxidant proteins using features calculated from primary sequences.</article-title> <source><italic>Biology</italic></source> <volume>9</volume>:<fpage>325</fpage>. <pub-id pub-id-type="doi">10.3390/biology9100325</pub-id> <pub-id pub-id-type="pmid">33036150</pub-id></citation></ref>
<ref id="B27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>H.</given-names></name> <name><surname>Gong</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>A review of protein inter-residue distance prediction.</article-title> <source><italic>Curr. Bioinform.</italic></source> <volume>15</volume> <fpage>821</fpage>&#x2013;<lpage>830</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615999200425230056</pub-id></citation></ref>
<ref id="B28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Niu</surname> <given-names>B.</given-names></name> <name><surname>Gao</surname> <given-names>Y.</given-names></name> <name><surname>Fu</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name></person-group> (<year>2010</year>). <article-title>CD-HIT Suite: a web server for clustering and comparing biological sequences.</article-title> <source><italic>Bioinformatics</italic></source> <volume>26</volume> <fpage>680</fpage>&#x2013;<lpage>682</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btq003</pub-id> <pub-id pub-id-type="pmid">20053844</pub-id></citation></ref>
<ref id="B29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jang</surname> <given-names>K. J.</given-names></name> <name><surname>Jeong</surname> <given-names>S.</given-names></name> <name><surname>Kang</surname> <given-names>D. Y.</given-names></name> <name><surname>Sp</surname> <given-names>N.</given-names></name> <name><surname>Yang</surname> <given-names>Y. M.</given-names></name> <name><surname>Kim</surname> <given-names>D. E.</given-names></name></person-group> (<year>2020</year>). <article-title>A high ATP concentration enhances the cooperative translocation of the SARS coronavirus helicase nsP13 in the unwinding of duplex RNA.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>10</volume> <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-61432-1</pub-id> <pub-id pub-id-type="pmid">32161317</pub-id></citation></ref>
<ref id="B30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jayaraman</surname> <given-names>S.</given-names></name> <name><surname>Gantz</surname> <given-names>D. L.</given-names></name> <name><surname>Gursky</surname> <given-names>O.</given-names></name></person-group> (<year>2006</year>). <article-title>Effects of salt on the thermal stability of human plasma high-density lipoprotein.</article-title> <source><italic>Biochemistry</italic></source> <volume>45</volume> <fpage>4620</fpage>&#x2013;<lpage>4628</lpage>. <pub-id pub-id-type="doi">10.1021/bi0524565</pub-id> <pub-id pub-id-type="pmid">16584197</pub-id></citation></ref>
<ref id="B31"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Joachims</surname> <given-names>T.</given-names></name></person-group> (<year>1998</year>). <source><italic>Making Large-scale SVM Learning Practical. Technical Report.</italic></source> <publisher-loc>Dortmund</publisher-loc>: <publisher-name>Technical University Dortmund</publisher-name>.</citation></ref>
<ref id="B32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ju</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>S. Y.</given-names></name></person-group> (<year>2020</year>). <article-title>Prediction of lysine formylation sites using the composition of k-spaced amino acid pairs via Chou&#x2019;s 5-steps rule and general pseudo components.</article-title> <source><italic>Genomics</italic></source> <volume>112</volume> <fpage>859</fpage>&#x2013;<lpage>866</lpage>. <pub-id pub-id-type="doi">10.1016/j.ygeno.2019.05.027</pub-id> <pub-id pub-id-type="pmid">31175975</pub-id></citation></ref>
<ref id="B33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kumar</surname> <given-names>S.</given-names></name> <name><surname>Tsai</surname> <given-names>C. J.</given-names></name> <name><surname>Nussinov</surname> <given-names>R.</given-names></name></person-group> (<year>2000</year>). <article-title>Factors enhancing protein thermostability.</article-title> <source><italic>Protein Eng.</italic></source> <volume>13</volume> <fpage>179</fpage>&#x2013;<lpage>191</lpage>. <pub-id pub-id-type="doi">10.1093/protein/13.3.179</pub-id> <pub-id pub-id-type="pmid">10775659</pub-id></citation></ref>
<ref id="B34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Rabe</surname> <given-names>K. S.</given-names></name> <name><surname>Nielsen</surname> <given-names>J.</given-names></name> <name><surname>Engqvist</surname> <given-names>M. K.</given-names></name></person-group> (<year>2019</year>). <article-title>Machine learning applied to predicting microorganism growth temperatures and enzyme catalytic optima.</article-title> <source><italic>ACS Synth. Biol.</italic></source> <volume>8</volume> <fpage>1411</fpage>&#x2013;<lpage>1420</lpage>. <pub-id pub-id-type="doi">10.1021/acssynbio.9b00099</pub-id> <pub-id pub-id-type="pmid">31117361</pub-id></citation></ref>
<ref id="B35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>H. L.</given-names></name> <name><surname>Pang</surname> <given-names>Y. H.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name></person-group> (<year>2021</year>). <article-title>BioSeq-BLM: a platform for analyzing DNA, RNA and protein sequences based on biological language models.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>49</volume>:<fpage>e129</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkab829</pub-id> <pub-id pub-id-type="pmid">34581805</pub-id></citation></ref>
<ref id="B36"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zhu</surname> <given-names>P.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name></person-group> (<year>2019</year>). &#x201C;<article-title>Prediction of thermophilic proteins using voting algorithm</article-title>,&#x201D; in <source><italic>Proceedings of the International Work-Conference on Bioinformatics and Biomedical Engineering</italic></source> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>195</fpage>&#x2013;<lpage>203</lpage>.</citation></ref>
<ref id="B37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name></person-group> (<year>2011</year>). <article-title>Prediction of thermophilic proteins using feature selection technique.</article-title> <source><italic>J. Microbiol. Methods</italic></source> <volume>84</volume> <fpage>67</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1016/j.mimet.2010.10.013</pub-id> <pub-id pub-id-type="pmid">21044646</pub-id></citation></ref>
<ref id="B38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>B.</given-names></name> <name><surname>Gao</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>BioSeq-Analysis2.0: an updated platform for analyzing DNA, RNA and protein sequences at sequence level and residue level based on machine learning approaches.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>47</volume>:<fpage>e127</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkz740</pub-id> <pub-id pub-id-type="pmid">31504851</pub-id></citation></ref>
<ref id="B39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>M. L.</given-names></name> <name><surname>Su</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>J. S.</given-names></name> <name><surname>Yang</surname> <given-names>Y. H.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>Predicting preference of transcription factors for methylated DNA using sequence information.</article-title> <source><italic>Mol. Ther. Nucleic Acids</italic></source> <volume>22</volume> <fpage>1043</fpage>&#x2013;<lpage>1050</lpage>. <pub-id pub-id-type="doi">10.1016/j.omtn.2020.07.035</pub-id> <pub-id pub-id-type="pmid">33294291</pub-id></citation></ref>
<ref id="B40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X. L.</given-names></name> <name><surname>Lu</surname> <given-names>J. L.</given-names></name> <name><surname>Hu</surname> <given-names>X. H.</given-names></name></person-group> (<year>2011</year>). <article-title>Predicting thermophilic proteins with pseudo amino acid composition: approached from chaos game representation and principal component analysis.</article-title> <source><italic>Protein Pept. Lett.</italic></source> <volume>18</volume> <fpage>1244</fpage>&#x2013;<lpage>1250</lpage>. <pub-id pub-id-type="doi">10.2174/092986611797642661</pub-id> <pub-id pub-id-type="pmid">21787282</pub-id></citation></ref>
<ref id="B41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Guan</surname> <given-names>Z. X.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Y. W.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>Deep-Kcr: accurate detection of lysine crotonylation sites using deep learning method.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume>:<fpage>bbaa255</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa255</pub-id> <pub-id pub-id-type="pmid">33099604</pub-id></citation></ref>
<ref id="B42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Guan</surname> <given-names>Z. X.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Su</surname> <given-names>W.</given-names></name><etal/></person-group> (<year>2020a</year>). <article-title>iDNA-MS: an integrated computational tool for detecting DNA modification sites in multiple genomes.</article-title> <source><italic>iScience</italic></source> <volume>23</volume>:<fpage>100991</fpage>. <pub-id pub-id-type="doi">10.1016/j.isci.2020.100991</pub-id> <pub-id pub-id-type="pmid">32240948</pub-id></citation></ref>
<ref id="B43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Ding</surname> <given-names>H.</given-names></name> <name><surname>Zhong</surname> <given-names>B.</given-names></name> <name><surname>Xu</surname> <given-names>L.</given-names></name></person-group> (<year>2020b</year>). <article-title><italic>Escherichia coli</italic> DNA N-4-methycytosine site prediction accuracy improved by light gradient boosting machine feature selection technology.</article-title> <source><italic>IEEE Access</italic></source> <volume>8</volume> <fpage>14851</fpage>&#x2013;<lpage>14859</lpage>. <pub-id pub-id-type="doi">10.1109/access.2020.2966576</pub-id></citation></ref>
<ref id="B44"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Jiang</surname> <given-names>Q.</given-names></name></person-group> (<year>2020c</year>). <article-title>Identification of sub-Golgi protein localization by use of deep representation learning features.</article-title> <source><italic>Bioinformatics</italic></source> <volume>36</volume> <fpage>5600</fpage>&#x2013;<lpage>5609</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa1074</pub-id> <pub-id pub-id-type="pmid">33367627</pub-id></citation></ref>
<ref id="B45"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Ding</surname> <given-names>H.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name></person-group> (<year>2020d</year>). <article-title>RF-PseU: a random forest predictor for RNA pseudouridine sites.</article-title> <source><italic>Front. Bioeng. Biotechnol.</italic></source> <volume>8</volume>:<fpage>134</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.00134</pub-id> <pub-id pub-id-type="pmid">32175316</pub-id></citation></ref>
<ref id="B46"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>Z.</given-names></name> <name><surname>Cui</surname> <given-names>F.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Xu</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>Anticancer peptides prediction with deep representation learning features.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume>:<fpage>bbab008</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab008</pub-id> <pub-id pub-id-type="pmid">33529337</pub-id></citation></ref>
<ref id="B47"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mahmoudi</surname> <given-names>M.</given-names></name> <name><surname>Arab</surname> <given-names>A.</given-names></name> <name><surname>Zahiri</surname> <given-names>J.</given-names></name> <name><surname>Parandian</surname> <given-names>Y.</given-names></name></person-group> (<year>2016</year>). <article-title>An overview of the protein thermostability prediction: databases and tools.</article-title> <source><italic>J. Nanomed. Res.</italic></source> <volume>3</volume>:<fpage>00072</fpage>.</citation></ref>
<ref id="B48"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meruelo</surname> <given-names>A. D.</given-names></name> <name><surname>Han</surname> <given-names>S. K.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Bowie</surname> <given-names>J. U.</given-names></name></person-group> (<year>2012</year>). <article-title>Structural differences between thermophilic and mesophilic membrane proteins.</article-title> <source><italic>Protein Sci.</italic></source> <volume>21</volume> <fpage>1746</fpage>&#x2013;<lpage>1753</lpage>. <pub-id pub-id-type="doi">10.1002/pro.2157</pub-id> <pub-id pub-id-type="pmid">23001966</pub-id></citation></ref>
<ref id="B49"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Miyazaki</surname> <given-names>K.</given-names></name> <name><surname>Takenouchi</surname> <given-names>M.</given-names></name> <name><surname>Kondo</surname> <given-names>H.</given-names></name> <name><surname>Noro</surname> <given-names>N.</given-names></name> <name><surname>Suzuki</surname> <given-names>M.</given-names></name> <name><surname>Tsuda</surname> <given-names>S.</given-names></name></person-group> (<year>2006</year>). <article-title>Thermal stabilization of <italic>Bacillus subtilis</italic> family-11 xylanase by directed evolution.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>281</volume> <fpage>10236</fpage>&#x2013;<lpage>10242</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.M511948200</pub-id> <pub-id pub-id-type="pmid">16467302</pub-id></citation></ref>
<ref id="B50"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nakariyakul</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>Z. P.</given-names></name> <name><surname>Chen</surname> <given-names>L.</given-names></name></person-group> (<year>2012</year>). <article-title>Detecting thermophilic proteins through selecting amino acid and dipeptide composition features.</article-title> <source><italic>Amino Acids</italic></source> <volume>42</volume> <fpage>1947</fpage>&#x2013;<lpage>1953</lpage>. <pub-id pub-id-type="doi">10.1007/s00726-011-0923-1</pub-id> <pub-id pub-id-type="pmid">21547362</pub-id></citation></ref>
<ref id="B51"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Panja</surname> <given-names>A. S.</given-names></name> <name><surname>Maiti</surname> <given-names>S.</given-names></name> <name><surname>Bandyopadhyay</surname> <given-names>B.</given-names></name></person-group> (<year>2020</year>). <article-title>Protein stability governed by its structural plasticity is inferred by physicochemical factors and salt bridges.</article-title> <source><italic>Sci. Rep.</italic></source> <volume>10</volume> <fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-58825-7</pub-id> <pub-id pub-id-type="pmid">32020026</pub-id></citation></ref>
<ref id="B52"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sadeghi</surname> <given-names>M.</given-names></name> <name><surname>Naderi-Manesh</surname> <given-names>H.</given-names></name> <name><surname>Zarrabi</surname> <given-names>M.</given-names></name> <name><surname>Ranjbar</surname> <given-names>B.</given-names></name></person-group> (<year>2006</year>). <article-title>Effective factors in thermostability of thermophilic proteins.</article-title> <source><italic>Biophys. Chem.</italic></source> <volume>119</volume> <fpage>256</fpage>&#x2013;<lpage>270</lpage>. <pub-id pub-id-type="doi">10.1016/j.bpc.2005.09.018</pub-id> <pub-id pub-id-type="pmid">16253416</pub-id></citation></ref>
<ref id="B53"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Saraboji</surname> <given-names>K.</given-names></name> <name><surname>Gromiha</surname> <given-names>M. M.</given-names></name> <name><surname>Ponnuswamy</surname> <given-names>M.</given-names></name></person-group> (<year>2005</year>). <article-title>Importance of main-chain hydrophobic free energy to the stability of thermophilic proteins.</article-title> <source><italic>Int. J. Biol. Macromol.</italic></source> <volume>35</volume> <fpage>211</fpage>&#x2013;<lpage>220</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijbiomac.2005.02.003</pub-id> <pub-id pub-id-type="pmid">15811476</pub-id></citation></ref>
<ref id="B54"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Saravanan</surname> <given-names>V.</given-names></name> <name><surname>Gautham</surname> <given-names>N.</given-names></name></person-group> (<year>2015</year>). <article-title>Harnessing computational biology for exact linear B-cell epitope prediction: a novel amino acid composition-based feature descriptor.</article-title> <source><italic>OMICS</italic></source> <volume>19</volume> <fpage>648</fpage>&#x2013;<lpage>658</lpage>. <pub-id pub-id-type="doi">10.1089/omi.2015.0095</pub-id> <pub-id pub-id-type="pmid">26406767</pub-id></citation></ref>
<ref id="B55"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shao</surname> <given-names>J.</given-names></name> <name><surname>Yan</surname> <given-names>K.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name></person-group> (<year>2021</year>). <article-title>FoldRec-C2C: protein fold recognition by combining cluster-to-cluster model and protein similarity network.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>22</volume>:<fpage>bbaa144</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa144</pub-id> <pub-id pub-id-type="pmid">32685972</pub-id></citation></ref>
<ref id="B56"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Suresh</surname> <given-names>N. T.</given-names></name> <name><surname>Ravindran</surname> <given-names>V. E.</given-names></name> <name><surname>Krishnakumar</surname> <given-names>U.</given-names></name></person-group> (<year>2021</year>). <article-title>A computational framework to identify cross association between complex disorders by protein-protein interaction network analysis.</article-title> <source><italic>Curr. Bioinform.</italic></source> <volume>16</volume> <fpage>433</fpage>&#x2013;<lpage>434</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615999200724145434</pub-id></citation></ref>
<ref id="B57"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Zhao</surname> <given-names>Y. W.</given-names></name> <name><surname>Zou</surname> <given-names>P.</given-names></name> <name><surname>Zhang</surname> <given-names>C. M.</given-names></name> <name><surname>Chen</surname> <given-names>R.</given-names></name> <name><surname>Huang</surname> <given-names>P.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>HBPred: a tool to identify growth hormone-binding proteins.</article-title> <source><italic>Int. J. Biol. Sci.</italic></source> <volume>14</volume>:<fpage>957</fpage>. <pub-id pub-id-type="doi">10.7150/ijbs.24174</pub-id> <pub-id pub-id-type="pmid">29989085</pub-id></citation></ref>
<ref id="B58"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Cao</surname> <given-names>R. Z.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>T. S.</given-names></name> <name><surname>Wang</surname> <given-names>L. M.</given-names></name> <name><surname>He</surname> <given-names>C. M.</given-names></name></person-group> (<year>2017</year>). <article-title>A two-step discriminated method to identify thermophilic proteins.</article-title> <source><italic>Int. J. Biomath.</italic></source> <volume>10</volume>:<fpage>1750050</fpage>. <pub-id pub-id-type="doi">10.1142/s1793524517500504</pub-id></citation></ref>
<ref id="B59"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>Y. J.</given-names></name> <name><surname>Pang</surname> <given-names>Y. H.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name></person-group> (<year>2020</year>). <article-title>IDP-Seq2Seq: identification of intrinsically disordered regions based on sequence to sequence learning.</article-title> <source><italic>Bioinformatics</italic></source> <volume>36</volume> <fpage>5177</fpage>&#x2013;<lpage>5186</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa667</pub-id> <pub-id pub-id-type="pmid">32702119</pub-id></citation></ref>
<ref id="B60"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Taud</surname> <given-names>H.</given-names></name> <name><surname>Mas</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>Multilayer perceptron (MLP),</article-title> in <source><italic>Geomatic Approaches for Modeling Land Change Scenarios.</italic></source> <publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>, <fpage>451</fpage>&#x2013;<lpage>455</lpage>.</citation></ref>
<ref id="B61"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tomii</surname> <given-names>K.</given-names></name> <name><surname>Kanehisa</surname> <given-names>M.</given-names></name></person-group> (<year>1996</year>). <article-title>Analysis of amino acid indices and mutation matrices for sequence comparison and structure prediction of proteins.</article-title> <source><italic>Protein Eng.</italic></source> <volume>9</volume> <fpage>27</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1093/protein/9.1.27</pub-id> <pub-id pub-id-type="pmid">9053899</pub-id></citation></ref>
<ref id="B62"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Uddin</surname> <given-names>S.</given-names></name> <name><surname>Khan</surname> <given-names>A.</given-names></name> <name><surname>Hossain</surname> <given-names>M. E.</given-names></name> <name><surname>Moni</surname> <given-names>M. A.</given-names></name></person-group> (<year>2019</year>). <article-title>Comparing different supervised machine learning algorithms for disease prediction.</article-title> <source><italic>BMC Med. Inform. Decis. Mak.</italic></source> <volume>19</volume>:<fpage>281</fpage>. <pub-id pub-id-type="doi">10.1186/s12911-019-1004-8</pub-id> <pub-id pub-id-type="pmid">31864346</pub-id></citation></ref>
<ref id="B63"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Yang</surname> <given-names>L.</given-names></name> <name><surname>Fu</surname> <given-names>Z.</given-names></name> <name><surname>Xia</surname> <given-names>J.</given-names></name></person-group> (<year>2011</year>). <article-title>Prediction of thermophilic protein with pseudo amino acid composition: an approach from combined feature selection and reduction.</article-title> <source><italic>Protein Pept. Lett.</italic></source> <volume>18</volume> <fpage>684</fpage>&#x2013;<lpage>689</lpage>. <pub-id pub-id-type="doi">10.2174/092986611795446085</pub-id> <pub-id pub-id-type="pmid">21413920</pub-id></citation></ref>
<ref id="B64"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Jiang</surname> <given-names>Y.</given-names></name> <name><surname>Mao</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2021</year>). <article-title>DM3Loc: multi-label mRNA subcellular localization prediction and analysis based on multi-head self-attention mechanism.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>49</volume>:<fpage>e46</fpage>. <pub-id pub-id-type="doi">10.1093/nar/gkab016</pub-id> <pub-id pub-id-type="pmid">33503258</pub-id></citation></ref>
<ref id="B65"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X. F.</given-names></name> <name><surname>Gao</surname> <given-names>P.</given-names></name> <name><surname>Liu</surname> <given-names>Y. F.</given-names></name> <name><surname>Li</surname> <given-names>H. F.</given-names></name> <name><surname>Lu</surname> <given-names>F.</given-names></name></person-group> (<year>2020</year>). <article-title>Predicting thermophilic proteins by machine learning.</article-title> <source><italic>Curr. Bioinform.</italic></source> <volume>15</volume> <fpage>493</fpage>&#x2013;<lpage>502</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615666200207094357</pub-id></citation></ref>
<ref id="B66"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Xu</surname> <given-names>Z.-C.</given-names></name> <name><surname>Su</surname> <given-names>W.</given-names></name> <name><surname>Yang</surname> <given-names>Y. H.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name><etal/></person-group> (<year>2021b</year>). <article-title>iCarPS: a computational tool for identifying protein carbonylation sites by novel encoded features.</article-title> <source><italic>Bioinformatics</italic></source> <volume>37</volume> <fpage>171</fpage>&#x2013;<lpage>177</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa702</pub-id> <pub-id pub-id-type="pmid">32766811</pub-id></citation></ref>
<ref id="B67"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Chen</surname> <given-names>H.-D.</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Yuan</surname> <given-names>S. S.</given-names></name> <name><surname>Huang</surname> <given-names>Q. L.</given-names></name> <name><surname>Zhang</surname> <given-names>Z. Y.</given-names></name><etal/></person-group> (<year>2021a</year>). <article-title>iBLP: an XGBoost-based predictor for identifying bioluminescent proteins.</article-title> <source><italic>Comput. Math. Methods Med.</italic></source> <volume>2021</volume>:<fpage>6664362</fpage>. <pub-id pub-id-type="doi">10.1155/2021/6664362</pub-id> <pub-id pub-id-type="pmid">33505515</pub-id></citation></ref>
<ref id="B68"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Fang</surname> <given-names>B.</given-names></name></person-group> (<year>2007</year>). <article-title>LogitBoost classifier for discriminating thermophilic and mesophilic proteins.</article-title> <source><italic>J. Biotechnol.</italic></source> <volume>127</volume> <fpage>417</fpage>&#x2013;<lpage>424</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbiotec.2006.07.020</pub-id> <pub-id pub-id-type="pmid">17045354</pub-id></citation></ref>
<ref id="B69"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Dong</surname> <given-names>B.</given-names></name> <name><surname>Teng</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Juan</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>Identification of human enzymes using amino acid composition and the composition of k-spaced amino acid pairs.</article-title> <source><italic>Biomed. Res. Int.</italic></source> <volume>2020</volume>:<fpage>9235920</fpage>. <pub-id pub-id-type="doi">10.1155/2020/9235920</pub-id> <pub-id pub-id-type="pmid">32596396</pub-id></citation></ref>
<ref id="B70"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Z. M.</given-names></name> <name><surname>Wang</surname> <given-names>J. S.</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>Early diagnosis of pancreatic ductal adenocarcinoma by combining relative expression orderings with machine-learning method.</article-title> <source><italic>Front. Cell Dev. Biol.</italic></source> <volume>8</volume>:<fpage>582864</fpage>. <pub-id pub-id-type="doi">10.3389/fcell.2020.582864</pub-id> <pub-id pub-id-type="pmid">33178697</pub-id></citation></ref>
<ref id="B71"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>X. X.</given-names></name> <name><surname>Wang</surname> <given-names>Y. B.</given-names></name> <name><surname>Pan</surname> <given-names>Y. J.</given-names></name> <name><surname>Li</surname> <given-names>W. F.</given-names></name></person-group> (<year>2008</year>). <article-title>Differences in amino acids composition and coupling patterns between mesophilic and thermophilic proteins.</article-title> <source><italic>Amino Acids</italic></source> <volume>34</volume> <fpage>25</fpage>&#x2013;<lpage>33</lpage>. <pub-id pub-id-type="doi">10.1007/s00726-007-0589-x</pub-id> <pub-id pub-id-type="pmid">17710363</pub-id></citation></ref>
<ref id="B72"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>H.</given-names></name> <name><surname>Guo</surname> <given-names>X.</given-names></name> <name><surname>Peng</surname> <given-names>L.</given-names></name> <name><surname>Ding</surname> <given-names>Y.</given-names></name> <name><surname>Tang</surname> <given-names>J.</given-names></name><etal/></person-group> (<year>2021</year>). <article-title>MK-FSVM-SVDD: a multiple kernel-based fuzzy SVM model for predicting DNA-binding proteins via support vector data description.</article-title> <source><italic>Curr. Bioinform.</italic></source> <volume>16</volume> <fpage>274</fpage>&#x2013;<lpage>283</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615999200607173829</pub-id></citation></ref>
<ref id="B73"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Huang</surname> <given-names>Q.-L.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Sun</surname> <given-names>Z.-J.</given-names></name> <name><surname>Dao</surname> <given-names>F.-Y.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep-4mCGP: a deep learning approach to predict 4mC sites in <italic>Geobacter pickeringii</italic> by using correlation-based feature selection technique.</article-title> <source><italic>Int. J. Mol. Sci.</italic></source> <volume>23</volume>:<fpage>1251</fpage>. <pub-id pub-id-type="doi">10.3390/ijms23031251</pub-id> <pub-id pub-id-type="pmid">35163174</pub-id></citation></ref>
<ref id="B74"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Sun</surname> <given-names>Z.-J.</given-names></name> <name><surname>Huang</surname> <given-names>Q. L.</given-names></name> <name><surname>Yuan</surname> <given-names>S. S.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Dao</surname> <given-names>F. Y.</given-names></name><etal/></person-group> (<year>2021a</year>). <article-title>Deep-4mCW2V: a sequence-based predictor to identify N4-methylcytosine sites in <italic>Escherichia coli</italic>.</article-title> <source><italic>Methods</italic></source> <pub-id pub-id-type="pii">S1046-2023(21)00198-5</pub-id> <pub-id pub-id-type="doi">10.1016/j.ymeth.2021.07.011</pub-id> <pub-id pub-id-type="pmid">34352373</pub-id></citation></ref>
<ref id="B75"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H.</given-names></name> <name><surname>Yuan</surname> <given-names>S. S.</given-names></name> <name><surname>Huang</surname> <given-names>Q. L.</given-names></name> <name><surname>Sun</surname> <given-names>Z. J.</given-names></name> <name><surname>Dao</surname> <given-names>F. Y.</given-names></name> <name><surname>Yu</surname> <given-names>X. L.</given-names></name><etal/></person-group> (<year>2021b</year>). <article-title>Identification of cyclin protein using gradient boost decision tree algorithm.</article-title> <source><italic>Comput. Struct. Biotechnol. J.</italic></source> <volume>19</volume> <fpage>4123</fpage>&#x2013;<lpage>4131</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2021.07.013</pub-id> <pub-id pub-id-type="pmid">34527186</pub-id></citation></ref>
<ref id="B76"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zuo</surname> <given-names>Y. C.</given-names></name> <name><surname>Chen</surname> <given-names>W.</given-names></name> <name><surname>Fan</surname> <given-names>G. L.</given-names></name> <name><surname>Li</surname> <given-names>Q. Z.</given-names></name></person-group> (<year>2013</year>). <article-title>A similarity distance of diversity measure for discriminating mesophilic and thermophilic proteins.</article-title> <source><italic>Amino Acids</italic></source> <volume>44</volume> <fpage>573</fpage>&#x2013;<lpage>580</lpage>. <pub-id pub-id-type="doi">10.1007/s00726-012-1374-z</pub-id> <pub-id pub-id-type="pmid">22851052</pub-id></citation></ref>
</ref-list>
</back>
</article>