<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">797641</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2021.797641</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Prediction of Hormone-Binding Proteins Based on K-mer Feature Representation and Naive Bayes</article-title>
<alt-title alt-title-type="left-running-head">Guo et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Protein Classification</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Guo</surname>
<given-names>Yuxin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1521820/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hou</surname>
<given-names>Liping</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhu</surname>
<given-names>Wen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/636955/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Peng</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Key Laboratory of Computational Science and Application of Hainan Province, <addr-line>Haikou</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Yangtze Delta Region Institute, University of Electronic Science and Technology of China, <addr-line>Quzhou</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Key Laboratory of Data Science and Intelligence Education, Hainan Normal University, Ministry of Education, <addr-line>Haikou</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>School of Mathematics and Statistics, Hainan Normal University, <addr-line>Haikou</addr-line>, <country>China</country>
</aff>
<aff id="aff5">
<label>
<sup>5</sup>
</label>Beidahuang Industry Group General Hospital, <addr-line>Harbin</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/531759/overview">Quan Zou</ext-link>, University of Electronic Science and Technology of China, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1524185/overview">Chunyan Ao</ext-link>, Xidian University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1524286/overview">Ru Xiaoqing</ext-link>, University of Tsukuba, Japan</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Wen Zhu, <email>syzhuwen@163.com</email>
</corresp>
<fn fn-type="equal" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors have contributed equally to this&#x20;work</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Computational Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>11</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>797641</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>10</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>11</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Guo, Hou, Zhu and Wang.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Guo, Hou, Zhu and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Hormone binding protein (HBP) is a soluble carrier protein that interacts selectively with different types of hormones and has various effects on the body&#x2019;s life activities. HBPs play an important role in the growth process of organisms, but their specific role is still unclear. Therefore, correctly identifying HBPs is the first step towards understanding and studying their biological function. However, due to their high cost and long experimental period, it is difficult for traditional biochemical experiments to correctly identify HBPs from an increasing number of proteins, so the real characterization of HBPs has become a challenging task for researchers. To measure the effectiveness of HBPs, an accurate and reliable prediction model for their identification is desirable. In this paper, we construct the prediction model HBP_NB. First, HBPs data were collected from the UniProt database, and a dataset was established. Then, based on the established high-quality dataset, the k-mer (K &#x3d; 3) feature representation method was used to extract features. Second, the feature selection algorithm was used to reduce the dimensionality of the extracted features and select the appropriate optimal feature set. Finally, the selected features are input into Naive Bayes to construct the prediction model, and the model is evaluated by using 10-fold cross-validation. The final results were 95.45% accuracy, 94.17% sensitivity and 96.73% specificity. These results indicate that our model is feasible and effective.</p>
</abstract>
<kwd-group>
<kwd>hormone binding protein</kwd>
<kwd>feature selection</kwd>
<kwd>protein classification</kwd>
<kwd>k-mer</kwd>
<kwd>naive Bayes model</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>With the rapid development of society, people have higher and higher requirements for medical and health care (<xref ref-type="bibr" rid="B29">Lin, 2020</xref>). Therefore, it is urgent to learn more about the structure and function of proteins in order to explain more of the meaning of life and promote the development of biomedicine and other fields (<xref ref-type="bibr" rid="B57">Wang et&#x20;al., 2020a</xref>; <xref ref-type="bibr" rid="B44">Qu et&#x20;al., 2021</xref>). However, there is a difficulty in the current research, that is, how to use its sequence information to predict proteins effectively. Although effective prediction of protein sequences can be made using physical, chemical and biological experiments, these methods are costly and time consuming.</p>
<p>Hormone binding proteins (HBPs) are carrier proteins that bind specifically to targeted hormones and were first identified in the plasma of pregnant mice, rabbits and humans (<xref ref-type="bibr" rid="B39">Mortezaeefar et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B40">Niu et&#x20;al., 2021a</xref>). They are involved in hormonal regulation in living organisms. HBPs not only regulate the amount of hormones reaching the target cell to produce the desired effect (<xref ref-type="bibr" rid="B56">Wang et&#x20;al., 2018</xref>) but also regulate non-protein-binding or free-circulating active steroid hormones, which are thought to be the main gatekeepers of steroid effects. Sexual HBPs, mainly produced in the liver, combine with sexual steroid hormones to regulate their bioavailability. The incorrect expression of HBPs, however, can cause various diseases (<xref ref-type="bibr" rid="B52">Tan et&#x20;al., 2019</xref>).</p>
<p>Therefore, understanding the function and regulatory mechanism of HBPs has become very important. Accurately identifying HBPs is the first step in studying their function. Traditional HBPs identification methods involve wet biochemical experiments, such as immunoprecipitation, chromatography, or cross-linking (<xref ref-type="bibr" rid="B50">Sohm et&#x20;al., 1998</xref>; <xref ref-type="bibr" rid="B68">Zhang and Marchant, 1999</xref>; <xref ref-type="bibr" rid="B26">Einarsd&#xf3;ttir et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B8">Cheng et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B14">Fang et&#x20;al., 2019</xref>). These experimental methods are time-consuming and expensive, and with the discovery of a large number of protein sequences, it is difficult to determine HBPs through biochemical experiments. Therefore, it is necessary to establish an effective recognition model to identify HBPs (<xref ref-type="bibr" rid="B1">Akbar et&#x20;al., 2020</xref>). The description of protein sequence characteristics contains a great deal of information, such as the chemical and physical properties of amino acids and sequence characteristics; the feature extraction algorithm therefore has a great impact on the design of the classification algorithm and on the classification results. Generally, prediction techniques based on machine learning consist of three steps: feature extraction, construction of predictors, and performance evaluation (<xref ref-type="bibr" rid="B37">Liu, 2017</xref>; <xref ref-type="bibr" rid="B56">Wang et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B67">Zhang et&#x20;al., 2019</xref>). In 2018, Tang et&#x20;al. (<xref ref-type="bibr" rid="B24">Hua et&#x20;al., 2018</xref>) developed a method based on support vector machines to identify HBPs, which encodes proteins using the optimal features obtained from the optimized dipeptide composition. Subsequently, Basith et&#x20;al. 
developed the computational predictor iGHBP, which combined the dipeptide composition and the value of the amino acid index to obtain the optimal feature selection and construct the prediction model (<xref ref-type="bibr" rid="B4">Basith et&#x20;al., 2018</xref>). In this paper, we constructed a prediction model, HBP_NB, to correctly identify HBPs. First, the k-mer (<xref ref-type="bibr" rid="B33">Liu et&#x20;al., 2008</xref>; <xref ref-type="bibr" rid="B10">Christopher et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B30">Liu et&#x20;al., 2015a</xref>; <xref ref-type="bibr" rid="B38">Manavalan et&#x20;al., 2019</xref>) method was used to obtain the frequency characteristics of protein sequences, and then the F-score value method was used to select the feature subset. Finally, the obtained features were input into Naive Bayes (<xref ref-type="bibr" rid="B17">Gong and Tian, 2010</xref>; <xref ref-type="bibr" rid="B20">He et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B18">Gumus et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B23">Hu et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B21">Hu et&#x20;al., 2021a</xref>; <xref ref-type="bibr" rid="B22">Hu et&#x20;al., 2021b</xref>) to construct the prediction&#x20;model.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<sec id="s2-1">
<title>Main Process of the Article</title>
<p>Machine learning frameworks have been used to identify multiple protein types, such as DNA binding proteins (<xref ref-type="bibr" rid="B63">Zeng et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B43">Qu et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B48">Shen and Zou, 2020</xref>), RNA binding proteins (<xref ref-type="bibr" rid="B60">Xiao et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B28">Lei et&#x20;al., 2021</xref>), lncRNA interacting proteins (<xref ref-type="bibr" rid="B66">Zhang et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B36">Liu, 2020</xref>), and drug targets (<xref ref-type="bibr" rid="B61">Yan et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B54">Wang et&#x20;al., 2020b</xref>; <xref ref-type="bibr" rid="B55">Wang et&#x20;al., 2020c</xref>). Since the recognition of protein sequences includes two important steps&#x2014;sequence feature extraction and classifier selection&#x2014;the effective combination of feature extraction algorithms and classifiers has also been extensively studied (<xref ref-type="bibr" rid="B65">Zhang et&#x20;al., 2016</xref>). In this paper, we propose a predictive model for identifying hormone-binding proteins based on Na&#xef;ve Bayes.</p>
<p>HBPs prediction analysis was carried out through the following five steps: 1) HBPs and non-HBPs were searched and downloaded from UniProt, and the similarity threshold of protein sequences was set by the CD-HIT program to construct a high-quality dataset (<xref ref-type="bibr" rid="B70">Zou et&#x20;al., 2020</xref>); 2) feature extraction of protein sequences was performed using the k-mer feature coding method; 3) the extracted features were selected to improve the accuracy of classification; 4) different classification methods were used to classify and predict the selected feature subset and select the best classification methods; and 5) Performance evaluation. <xref ref-type="fig" rid="F1">Figure&#x20;1</xref> shows the structural framework for identifying HBPs in this paper. This section will introduce dataset establishment, feature selection methods and classification methods in detail.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Structure flow chart. The first step is to search and download HBPs and non-HBPs from the protein resource database and then use CD-HIT to perform protein de-redundancy operations. The threshold is set to 60%. Finally, protein sequences containing unknown residues are removed to generate the final protein dataset. The second step is to extract features of the protein, and the third step is to use different classification methods to classify the selected features.</p>
</caption>
<graphic xlink:href="fgene-12-797641-g001.tif"/>
</fig>
</sec>
<sec id="s2-2">
<title>Dataset</title>
<p>It is necessary to collect sufficient correlation function data as the basis of statistical model prediction. Therefore, it is first necessary to construct an objective dataset to ensure the effectiveness and robustness of the model. Accordingly, we adopt the benchmark dataset constructed by Tang et&#x20;al. (<xref ref-type="bibr" rid="B53">Tang et&#x20;al., 2018</xref>). This dataset was built in the following steps. The first step was to search and collect HBPs from UniProt (<xref ref-type="bibr" rid="B3">Bairoch et&#x20;al., 2009</xref>; <xref ref-type="bibr" rid="B47">Schneider, 2012</xref>) and to generate the original HBPs dataset by selecting the hormone binding keywords in the molecular function items of the gene body (<xref ref-type="bibr" rid="B2">Ashburner et&#x20;al., 2000</xref>). Consequently, 357 HBPs with manual annotation and review were selected. In the second step, to avoid the high similarity of protein sequences affecting the results, we used the CD-HIT (<xref ref-type="bibr" rid="B16">Li and Godzik, 2006</xref>; <xref ref-type="bibr" rid="B15">Fu et&#x20;al., 2012</xref>) program to set the truncation threshold to 0.6 to remove highly similar HBPs sequences. In the third step, when the protein sequence in the dataset contains unknown residues (such as &#x201c;X,&#x201d; &#x201c;Z,&#x201d; and &#x201c;B&#x201d;), it will affect the model prediction results, so protein sequences containing unknown residues need to be excluded. After the above steps, a total of 122 HBPs were obtained, which were regarded as positive data. As a control, 121&#x20;non-HBPs were randomly selected from UniProt as negative data using a similar selection strategy. The data of the model can be freely downloaded from <ext-link ext-link-type="uri" xlink:href="https://github.com/GUOYUXINXIN/">https://github.com/GUOYUXINXIN/</ext-link>-. The benchmark dataset can be expressed as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>&#x222a;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Among them, subset <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>contains 122 HBPs, and subset <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>contains 121&#x20;non-HBPs.</p>
</sec>
<sec id="s2-3">
<title>Feature Extraction</title>
<p>Protein sequence is a string generated by the permutation and combination of 20 English letters with different lengths. Currently, general machine learning algorithms can only deal with feature vectors, so when machine learning methods are used, protein sequences need to be transformed into numerical vectors representing the characteristics of protein sequences. As the first step in building a biological sequence analysis model, feature extraction is an important part of correctly predicting protein sequences; an efficient feature extraction method can obtain a high-performance classification model. The extracted features should not only retain the protein sequence information to the maximum extent, but also have a greater correlation with protein classification. Given a protein sequence, express it as:<disp-formula id="e2">
<mml:math id="m4">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x22ef;</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf3">
<mml:math id="m5">
<mml:mi>P</mml:mi>
</mml:math>
</inline-formula>stands for protein sequence, <inline-formula id="inf4">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>represents the<inline-formula id="inf5">
<mml:math id="m7">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>amino acid residue of protein<inline-formula id="inf6">
<mml:math id="m8">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<sec id="s2-3-1">
<title>K-Mer</title>
<p>K-mer (<xref ref-type="bibr" rid="B31">Liu et&#x20;al., 2015b</xref>; <xref ref-type="bibr" rid="B41">Niu et&#x20;al., 2021b</xref>) is the most basic method of expressing protein sequences as digital vectors (<xref ref-type="bibr" rid="B32">Liu et&#x20;al., 2016</xref>), in which k-mer frequency coding refers to the occurrence frequency of all possible nucleotide sequences with k length in a given sequence (<xref ref-type="bibr" rid="B34">Liu et&#x20;al., 2015c</xref>; <xref ref-type="bibr" rid="B5">Bin et&#x20;al., 2017</xref>). The k-mer feature extraction algorithm is used to convert the protein sequence into a vector with a fixed length, which is used as the input vector of the machine learning classifier. For example, setting k to 2 produces a 400-dimensional vector <inline-formula id="inf7">
<mml:math id="m9">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mi>Y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. To avoid the problem of overfitting, we generally set<inline-formula id="inf8">
<mml:math id="m10">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> because when<inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, more dimensions will be generated, resulting in dimension disaster (<xref ref-type="bibr" rid="B58">Wei et&#x20;al., 2019</xref>). Therefore, we set k to 3 so that the input protein sequence could be converted into a vector with 8,000 dimensions of fixed length.</p>
</sec>
<sec id="s2-3-2">
<title>Distance-Based Residual</title>
<p>DR (<xref ref-type="bibr" rid="B35">Liu et&#x20;al., 2014</xref>) is a feature expression method based on protein sequences that uses the distance between residue pairs to represent the feature vector of the protein. The feature vector is expressed by calculating the number of occurrences of residual pairs within a certain distance threshold. The feature vector dimension obtained by the DR feature extraction method is <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>dimensions, where 20 in <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:mn>20</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>20</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>represents the types of amino acids that make up the protein; <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>X</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>is a distance threshold that can be set manually, which represents the maximum distance between pairs of amino acid residues.</p>
</sec>
<sec id="s2-3-3">
<title>Profile-Based Cross-Covariance</title>
<p>Since machine learning-based technologies such as random forest (RF) and logistic regression (LR) require the input of fixed-length vectors as input vectors for training, it is necessary to convert protein sequences of different lengths into fixed-length vectors for machine learning. Because each residue in a protein has many physical and chemical properties, protein sequences can be regarded as time series with similar properties. Therefore, CC-PSSM (<xref ref-type="bibr" rid="B13">Dong et&#x20;al., 2009</xref>) is used in this article to convert protein sequences of different lengths into fixed length vectors. The PSSM algorithm is a common algorithm in the field of bioinformatics, known as the &#x201c;position-specific scoring matrix,&#x201d; which can store the evolutionary information of protein sequences so that it can be used for protein prediction. It is a matrix that calculates the percentage of different residues at each position in a multi-sequence alignment; the matrix size is <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> (<inline-formula id="inf14">
<mml:math id="m16">
<mml:mi>L</mml:mi>
</mml:math>
</inline-formula> for protein sequence length). Among them, CC is a measure of correlation between two different properties of amino acid residues and can be calculated using the following equation:<disp-formula id="e3">
<mml:math id="m17">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>S</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>S</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where<inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>represents amino acids, and <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>S</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>S</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the average score of <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>along the protein sequence. <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the maximum lag, <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is an integer value from 1 to <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the total number of CC variables is <inline-formula id="inf21">
<mml:math id="m24">
<mml:mrow>
<mml:mn>380</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. In this paper, we set the value of<inline-formula id="inf22">
<mml:math id="m25">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to 2 to obtain a <inline-formula id="inf23">
<mml:math id="m26">
<mml:mrow>
<mml:mn>720</mml:mn>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>380</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>-dimensional vector.</p>
</sec>
</sec>
<sec id="s2-4">
<title>Feature Selection</title>
<p>When the feature size is large, there may be irrelevant features or inter-dependence between features, which will easily affect the accuracy of the prediction results. In particular, the more feature dimensions, the more likely it is to lead to &#x201c;dimension disaster,&#x201d; model complexity and model generalization ability decline. Therefore, removing irrelevant or redundant features through feature selection can improve the accuracy of classification performance and reduce the running time of the model (<xref ref-type="bibr" rid="B42">Polat and G&#xfc;ne&#x15f;, 2009</xref>; <xref ref-type="bibr" rid="B45">Quan et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B69">Zou et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B25">Guohua and Jincheng, 2018</xref>; <xref ref-type="bibr" rid="B59">Wei et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B46">Riaz and Li, 2019</xref>; <xref ref-type="bibr" rid="B19">He et&#x20;al., 2020</xref>). In this paper, the F-score value is used to select the optimal feature (<xref ref-type="bibr" rid="B6">Chen and Lin, 2008</xref>; <xref ref-type="bibr" rid="B9">Cheng et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B58">Wei et&#x20;al., 2019</xref>), which is a method to measure the distinguishing ability of features between the two categories, and the most effective feature selection can be achieved through this method. Therefore, we can use (<xref ref-type="disp-formula" rid="e4">Eq. 4</xref>) to describe the contribution of each feature and perform feature selection:<disp-formula id="e4">
<mml:math id="m27">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where<inline-formula id="inf24">
<mml:math id="m28">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the score of the<inline-formula id="inf25">
<mml:math id="m29">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> feature of the F-score. Generally, the larger the value of <inline-formula id="inf26">
<mml:math id="m30">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is, the stronger the ability to recognize samples.<inline-formula id="inf27">
<mml:math id="m31">
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the intragroup variance, and<inline-formula id="inf28">
<mml:math id="m32">
<mml:mrow>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the intergroup variance. Their calculation formula is as follows:<disp-formula id="e5">
<mml:math id="m33">
<mml:mrow>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>b</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:msubsup>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where<inline-formula id="inf29">
<mml:math id="m34">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>b</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>is the sum of squares between groups; <inline-formula id="inf30">
<mml:math id="m35">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>w</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>is the sum of squares within the group; <inline-formula id="inf31">
<mml:math id="m36">
<mml:mi>K</mml:mi>
</mml:math>
</inline-formula>is the total number of classes; and<inline-formula id="inf32">
<mml:math id="m37">
<mml:mi>N</mml:mi>
</mml:math>
</inline-formula>is the total number of samples.</p>
</sec>
<sec id="s2-5">
<title>Classifier</title>
<p>In this paper, Naive Bayes, random forests, logistic regression, linear discriminant analysis and other classification algorithms are used to predict&#x20;HBPs.</p>
<sec id="s2-5-1">
<title>Na&#xef;ve Bayes</title>
<p>The Naive Bayes method is a classification method based on Bayes&#x2019; theorem and the assumption of the independence of characteristic conditions. It is characterized by combining prior and posterior probabilities, and it is a very widely used algorithm. The main idea of the naive Bayes classifier is to solve the posterior probability <inline-formula id="inf33">
<mml:math id="m38">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> through joint probability modeling and use Bayes&#x2019; theorem. Then, the category corresponding to the largest posterior probability is used as the predicted category. Suppose there is a sample dataset <inline-formula id="inf34">
<mml:math id="m39">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, the feature dataset corresponding to the sample dataset is <inline-formula id="inf35">
<mml:math id="m40">
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, features are independent and random, and the class variable is <inline-formula id="inf36">
<mml:math id="m41">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. According to the Naive Bayes algorithm, the posterior probability of the sample belonging to category<inline-formula id="inf37">
<mml:math id="m42">
<mml:mi>y</mml:mi>
</mml:math>
</inline-formula>can be expressed as:<disp-formula id="e6">
<mml:math id="m43">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where<inline-formula id="inf38">
<mml:math id="m44">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>is the prior probability. Naive Bayes is based on the independence of each feature; in the case of a given category, <xref ref-type="disp-formula" rid="e6">Eq. 6</xref> can be further expressed as the following equation:<disp-formula id="e7">
<mml:math id="m45">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>X</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>The posterior probability can be calculated from the above two <xref ref-type="disp-formula" rid="e6">Eqs 6</xref>, <xref ref-type="disp-formula" rid="e7">7</xref>:<disp-formula id="e8">
<mml:math id="m46">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>Since the magnitude of <inline-formula id="inf39">
<mml:math id="m47">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>X</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>is fixed, when comparing the posterior probability, only the numerator of the above equation needs to be compared. Therefore, a naive Bayesian calculation of sample data belonging to category <inline-formula id="inf40">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be obtained:<disp-formula id="e9">
<mml:math id="m49">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>d</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mstyle>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-5-2">
<title>Random Forests</title>
<p>RF is a flexible, easy-to-use machine learning algorithm that contains multiple decision trees. It is an optimized version of bagging (<xref ref-type="bibr" rid="B51">Su et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B64">Zeng et&#x20;al., 2020</xref>). The idea of bagging is to vote on the results of multiple weak classifiers to combine them into a strong classifier, thereby improving the prediction accuracy of the model. In the training phase, RF uses the bootstrap sampling method to collect multiple different subsets from the input training dataset and then uses the different collected subsets to train the internal decision tree. Then, in the prediction phase, RF votes for the prediction results of multiple internal decision trees and then outputs the prediction results. Its advantages are as follows: 1) it can process high-dimensional data without feature selection; 2) accuracy can be maintained even if many of the features are missing; and 3) it has a fast training speed (<xref ref-type="bibr" rid="B27">Jiao et&#x20;al., 2021</xref>).</p>
</sec>
<sec id="s2-5-3">
<title>Logistic Regression</title>
<p>As a classification model, LR can deal with the 0/1 classification problem because of the nonlinear factor introduced by the sigmoid function. The image of the logistic function is an S-shaped curve with values between (0, 1). The farther the input is from 0, the closer the value of the function will be to 0 or 1. Therefore, this feature can be used to solve the problem of binary classification. The function formula is as follows:<disp-formula id="e10">
<mml:math id="m50">
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>z</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>z</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>Among them, <inline-formula id="inf41">
<mml:math id="m51">
<mml:mrow>
<mml:mi>z</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
</mml:mrow>
</mml:mstyle>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>; therefore, the predictive function of logistic regression can be expressed as:<disp-formula id="e11">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>&#x3b8;</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-5-4">
<title>Linear Discriminant Analysis</title>
<p>LDA is a classical linear learning method, also known as &#x201c;Fisher&#x201d; discriminant analysis in dichotomies. Unlike the perception machine, the principle of LDA is dimension reduction. In other words, given a set of training samples, the method tries to project the samples onto a straight line, keeping the points of the same class as close as possible and the points of different classes as far apart as possible, i.e.,&#x20;maximizing the between-class variance while minimizing the within-class variance. LDA can, therefore, use the positions of sample points on the projection line to determine the class of a sample.</p>
</sec>
</sec>
<sec id="s2-6">
<title>Performance Evaluation</title>
<p>In this article, we use the specificity (SP), sensitivity (SN), accuracy (ACC) (<xref ref-type="bibr" rid="B62">Yang et&#x20;al., 2021</xref>) and Matthews correlation coefficient (MCC) to evaluate our proposed method (<xref ref-type="bibr" rid="B49">Snow et&#x20;al., 2005</xref>; <xref ref-type="bibr" rid="B7">Cheng et&#x20;al., 2018</xref>), which can be expressed&#x20;as:</p>
<p>1. Accuracy: ACC represents the probability that all positive and negative samples will be correctly predicted.<disp-formula id="e12">
<mml:math id="m53">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>2. Sensitivity: SN represents the probability that the actual hormone-binding protein is predicted to be a hormone-binding protein.<disp-formula id="e13">
<mml:math id="m54">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>3. Specificity: SP represents the probability that a non-hormone-binding protein is predicted to be a non-hormone-binding protein.<disp-formula id="e14">
<mml:math id="m55">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>
</p>
<p>4. MCC: MCC represents the reliability of the algorithm results.<disp-formula id="e15">
<mml:math id="m56">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
</p>
<p>5. Precision: Indicates how many of the samples predicted to be positive are true positive samples.<disp-formula id="e16">
<mml:math id="m57">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>
</p>
<p>6. F1-Score: The F1 score is balanced by taking into account both precision and recall, so that both are maximized at the same time.<disp-formula id="e17">
<mml:math id="m58">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>Where, the recall rate is: <inline-formula id="inf42">
<mml:math id="m59">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
<p>7. The ROC curve: Receiver operating characteristic curve (the area under the curve is AUROC), <italic>X</italic>-axis is false positive rate (FPR), <italic>Y</italic>-axis is true positive rate (TPR):<disp-formula id="e18">
<mml:math id="m60">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
<disp-formula id="e19">
<mml:math id="m61">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>8. PRC: PRC takes precision rate as <italic>Y</italic>-axis and recall rate as <italic>X</italic>-axis.</p>
<p>Where <inline-formula id="inf43">
<mml:math id="m62">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>refers to the model correctly predicting positive category samples; <inline-formula id="inf44">
<mml:math id="m63">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>refers to the model incorrectly predicting negative category samples as positive category; <inline-formula id="inf45">
<mml:math id="m64">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> refers to the model correctly predicting negative category samples; and <inline-formula id="inf46">
<mml:math id="m65">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>refers to the model incorrectly predicting positive category samples as negative category (<xref ref-type="bibr" rid="B11">Ding et&#x20;al., 2020a</xref>; <xref ref-type="bibr" rid="B12">Ding et&#x20;al., 2020b</xref>).</p>
<p>In machine learning, a test set is needed to test the model and describe its generalization ability. However, in practical applications, due to the limited number of datasets, cross validation is used as a test method. There are three types of cross validation: K-fold cross validation, leave-one-out cross validation and independent dataset validation. In this article, we use K-fold cross-validation to test the constructed model. K-fold cross-validation divides the training data into K parts, of which (K-1) parts are used to train the model, and the remaining part is used to evaluate the quality of the model. This process is cycled K times, and the K evaluation results obtained are combined, for example by averaging or voting. The flow chart of K-fold cross-validation is shown in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>K-fold cross-validation diagram. Divide the data into K parts, where K-1 parts are used as the training dataset, and the remaining part is used as the test set. The mean value of the results of the K groups is calculated as the performance index of the model in the current K-fold cross-validation&#x20;evaluation.</p>
</caption>
<graphic xlink:href="fgene-12-797641-g002.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="results|discussion" id="s3">
<title>Results and Discussion</title>
<p>In machine learning, the predicted results of the model can be tested through cross-validation. In this article, we use 10-fold cross-validation to evaluate the built&#x20;model.</p>
<sec id="s3-1">
<title>Performance Comparison of Different Feature Expression Methods</title>
<p>According to the feature extraction part, protein sequences are transformed into feature vectors of different sizes through different feature extraction methods. Therefore, in this study we tested the performance of four feature extraction methods: k-mer (K &#x3d; 2), k-mer (K &#x3d; 3), DR and CC-PSSM.</p>
<p>First, we use the F-score feature selection method to reduce the dimensionality of the feature vectors obtained by different feature extraction methods to 250 dimensions, then use the selected best feature vector as the input vector of the naive Bayes algorithm and perform 10-fold cross-validation, and finally obtain the prediction results. The prediction results are shown in <xref ref-type="table" rid="T1">Table&#x20;1</xref> (the maximum value is in bold). As shown in <xref ref-type="table" rid="T1">Table&#x20;1</xref>, the k-mer (k &#x3d; 3) feature extraction algorithm used in this model performs best in all indicators, among which the values of ACC, MCC, SP and SN are, respectively, 95.45, 91.36, 96.73, and 94.17%. These results prove the validity of our&#x20;model.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Prediction results of different feature extraction algorithms based on the Bayesian classifier.</p>
</caption>
<table>
<thead>
<tr>
<td align="left">Feature extraction</td>
<td align="center">SN(%)</td>
<td align="center">SP(%)</td>
<td align="center">ACC(%)</td>
<td align="center">MCC(%)</td>
<td align="center">AUROC(%)</td>
<td align="center">PRC(%)</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">K-mer(k &#x3d; 3)</td>
<td align="char" char=".">
<bold>94.17</bold>
</td>
<td align="char" char=".">
<bold>96.73</bold>
</td>
<td align="char" char=".">
<bold>95.45</bold>
</td>
<td align="char" char=".">
<bold>91.36</bold>
</td>
<td align="char" char=".">
<bold>95.17</bold>
</td>
<td align="char" char=".">
<bold>96.55</bold>
</td>
</tr>
<tr>
<td align="left">K-mer(k &#x3d; 2)</td>
<td align="char" char=".">65.51</td>
<td align="char" char=".">78.46</td>
<td align="char" char=".">71.96</td>
<td align="char" char=".">44.50</td>
<td align="char" char=".">77.89</td>
<td align="char" char=".">76.97</td>
</tr>
<tr>
<td align="left">DR</td>
<td align="char" char=".">83.46</td>
<td align="char" char=".">37.12</td>
<td align="char" char=".">60.39</td>
<td align="char" char=".">25.64</td>
<td align="char" char=".">66.35</td>
<td align="char" char=".">75.99</td>
</tr>
<tr>
<td align="left">CC-PSSM</td>
<td align="char" char=".">64.10</td>
<td align="char" char=".">80.13</td>
<td align="char" char=".">72.09</td>
<td align="char" char=".">45.29</td>
<td align="char" char=".">78.24</td>
<td align="char" char=".">80.27</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>Comparison With Other Classifiers </title>
<p>To show the superiority of naive Bayes in HBPs recognition, we can compare the HBPs recognition performance of different classification algorithms based on the same feature subset (i.e., 250 optimal features). In this paper, we used the constructed HBP_NB model to compare performance with RF, LDA, logistic regression and other models under the condition of 10-fold cross-validation, and the comparison results are shown as follows. <xref ref-type="table" rid="T2">Table&#x20;2</xref> shows the specific values of different classification models under SN, SP, ACC, MCC and other indicators (the maximum values are in bold). As can be seen from <xref ref-type="table" rid="T2">Table&#x20;2</xref>, the HBP_NB prediction model achieved better results than other classification algorithms in identifying hormone-binding proteins, in which ACC, MCC, SN and SP were 95.45, 91.36, 94.17 and 96.73%, respectively. <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref> respectively show the boxplot diagrams of the different models and the schematic diagrams of the ROC and PRC curves. These results show that our model has good classification ability. Therefore, we construct the final model based on naive Bayes. In the boxplot, the line in the middle of the box is the median of the data, representing the average level of the sample data; the top of the box represents the upper quartile and the bottom represents the lower quartile, which means the box contains 50% of the data, so the width of the box reflects, to some extent, how much the data fluctuates; at the same time, the lines above and below the box represent the maximum and minimum values of the data. The ROC curve is a curve that evaluates the effect of a binary model on positive category prediction. 
<italic>X</italic>-axis is false positive rate (FPR), <italic>Y</italic>-axis is true positive rate (TPR), which indicates that the optimal classifier with the best performance is located in the upper left corner of the image (coordinate 0,1), and the area under its ROC curve is AUROC, with an area value between 0 and 1. PRC takes precision rate as <italic>Y</italic>-axis and recall rate as <italic>X</italic>-axis, and lines are drawn according to changes in the value of the probability threshold. The ideal model would be at the point (1,1). A model with excellent performance is as close to this point as possible.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performance comparison of different classifiers under 10-fold cross validation.</p>
</caption>
<table>
<thead>
<tr>
<td align="left">Classifier</td>
<td align="center">SN(%)</td>
<td align="center">SP(%)</td>
<td align="center">ACC(%)</td>
<td align="center">MCC(%)</td>
<td align="center">AUROC(%)</td>
<td align="center">PRC(%)</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">NB</td>
<td align="char" char=".">94.17</td>
<td align="char" char=".">
<bold>96.73</bold>
</td>
<td align="char" char=".">
<bold>95.45</bold>
</td>
<td align="char" char=".">
<bold>91.36</bold>
</td>
<td align="char" char=".">
<bold>95.17</bold>
</td>
<td align="char" char=".">
<bold>96.55</bold>
</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="char" char=".">77.95</td>
<td align="char" char=".">87.57</td>
<td align="char" char=".">82.71</td>
<td align="char" char=".">66.26</td>
<td align="char" char=".">89.45</td>
<td align="char" char=".">91.19</td>
</tr>
<tr>
<td align="left">LDA</td>
<td align="char" char=".">72.24</td>
<td align="char" char=".">70.13</td>
<td align="char" char=".">71.20</td>
<td align="char" char=".">43.08</td>
<td align="char" char=".">94.53</td>
<td align="char" char=".">95.32</td>
</tr>
<tr>
<td align="left">LR</td>
<td align="char" char=".">
<bold>96.92</bold>
</td>
<td align="char" char=".">17.50</td>
<td align="char" char=".">57.00</td>
<td align="char" char=".">14.42</td>
<td align="char" char=".">76.35</td>
<td align="char" char=".">79.43</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Boxplot diagram of different classification models, this figure shows the distribution of LDA, LR, RF and NB under SN, SP, accuracy, ACC, MCC, F1-Score, AUROC and AUPRC successively from left to right and from top to bottom. At the same time, it can be seen from the figure that NB can achieve good results under different indicators.</p>
</caption>
<graphic xlink:href="fgene-12-797641-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>As can be seen from the ROC curves and PRC curves of the different classification models, the AUROC values of LR, RF, LDA and NB are 0.7635, 0.8945, 0.9453 and 0.9517, respectively. The dotted line represents the ROC curve of a completely random classifier, and the ROC curve of a good classifier should be as far away from the dotted line as possible, as close as possible to the upper left corner; the PRC curve values of LR, RF, LDA and NB were 0.7943, 0.9119, 0.9532 and 0.9655, respectively; the closer the curve was to the upper right corner, the better the model classification ability was. Therefore, we constructed the final model based on NB.</p>
</caption>
<graphic xlink:href="fgene-12-797641-g004.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>Performance Comparison With the Existing Optimal Algorithm</title>
<p>This section compares the model constructed in the article with other existing methods, in which the results of HBPred (<xref ref-type="bibr" rid="B24">Hua et&#x20;al., 2018</xref>) and iGHBP (<xref ref-type="bibr" rid="B4">Basith et&#x20;al., 2018</xref>) are directly obtained from the literature. The comparison results are shown in <xref ref-type="table" rid="T3">Table&#x20;3</xref> (the maximum value is in bold). As seen from <xref ref-type="table" rid="T3">Table&#x20;3</xref>, the HBP_NB model constructed in this paper has the best performance in all indicators, among which ACC, SP and SN have reached maximum values of 95.45, 96.73 and 94.17%, respectively. The effect is significantly better than that of the other two methods, which also proves the effectiveness of the HBP_NB model constructed in this&#x20;paper.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison of our method with other published methods.</p>
</caption>
<table>
<thead>
<tr>
<td align="left">Methods</td>
<td align="center">SN(%)</td>
<td align="center">SP(%)</td>
<td align="center">ACC(%)</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">HBPred <xref ref-type="bibr" rid="B24">Hua et&#x20;al. (2018)</xref>
</td>
<td align="char" char=".">80.43</td>
<td align="char" char=".">56.52</td>
<td align="char" char=".">68.48</td>
</tr>
<tr>
<td align="left">iGHBP <xref ref-type="bibr" rid="B4">Basith et&#x20;al. (2018)</xref>
</td>
<td align="char" char=".">86.96</td>
<td align="char" char=".">47.83</td>
<td align="char" char=".">67.39</td>
</tr>
<tr>
<td align="left">HBP_NB</td>
<td align="char" char=".">
<bold>94.17</bold>
</td>
<td align="char" char=".">
<bold>96.73</bold>
</td>
<td align="char" char=".">
<bold>95.45</bold>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<title>Conclusion</title>
<p>As a carrier protein related to the regulation of hormones in the circulatory system, HBPs can cause various diseases when they are abnormally expressed. Therefore, it is very important to understand their function and regulatory mechanism, and the correct identification of HBPs is the first step in understanding their biological process and is necessary to further study their function. There is growing evidence that it is crucial to develop an efficient computational model to identify hormone-binding proteins. In this study, we used a reliable predictive model for HBP_NB to identify HBPs. First, the model uses the k-mer feature extraction method to extract the features of HBPs. Then, to remove redundancy and noise and improve the accuracy of model prediction, the F-score value is used to sort the features and select the optimal features. Secondly, the reduced feature set is input into naive Bayes classifier and the 10-fold cross validation is used to judge the quality of the prediction model. Finally, the accuracy, sensitivity and specificity of the HBP_NB model reached 95.45, 94.17 and 96.73%, respectively, in 10-fold cross validation. The feasibility and validity of our model are illustrated.</p>
<p>However, there is room for improvement in our current approach. Since the data set selected in this experiment is small, we will collect more data for model training and independent test set experiments in the future to improve the model&#x2019;s robustness and generalization ability. At the same time, we will further learn more effective feature representation methods and classification algorithms to gain an in-depth understanding of machine learning and establish a more stable model. In addition, we also hope that our work can help scholars to study hormone binding proteins, to promote research on hormone-binding protein&#x20;drugs.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/Supplementary Material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>Conceptualization, WZ and PW; data collection or analysis, YG and PW; validation, YG and LH; writing&#x2014;original draft preparation, YG and LH; writing&#x2014;review and editing, YG and PW. All authors have read and agreed to the published version of the article.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This work was supported by the National Natural Science Foundation of China (Grant Nos. 61863010, 11926205, 11926412, and 61873076), the National Key R&#x26;D Program of China (No. 2020YFB2104400), the Natural Science Foundation of Hainan, China (Grant Nos. 119MS036 and 120RC588), the Hainan Normal University 2020 Graduate Student Innovation Research Project (hsyx 2020&#x2013;41), and the Special Science Foundation of Quzhou (2020D003).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ack>
<p>Thanks to the guidance of my tutor and the joint efforts of other authors, the success of this article is the result of everyone&#x2019;s joint efforts.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akbar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Hayat</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Qasim</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gul</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>iHBP-DeepPSSM: Identifying Hormone Binding Proteins Using PsePSSM Based Evolutionary Features and Deep Learning Approach</article-title>. <source>Chemometrics Intell. Lab. Syst.</source> <volume>204</volume>, <fpage>104103</fpage>. <pub-id pub-id-type="doi">10.1016/j.chemolab.2020.104103</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ashburner</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ball</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Blake</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Botstein</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Butler</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cherry</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>Gene Ontology: Tool for the Unification of Biology</article-title>. <source>Nat. Genet.</source> <volume>25</volume> (<issue>1</issue>), <fpage>25</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1038/75556</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bairoch</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bougueleret</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Altairac</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Amendolia</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>The Universal Protein Resource (UniProt) 2009</article-title>. <source>Nucleic Acids Res.</source> <volume>37</volume> (<issue>Suppl. 1</issue>), <fpage>D169</fpage>&#x2013;<lpage>D174</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkn664</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Basith</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Manavalan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>iGHBP: Computational Identification of Growth Hormone Binding Proteins from Sequences Using Extremely Randomised Tree</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>16</volume>, <fpage>412</fpage>&#x2013;<lpage>420</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2018.10.007</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kuo-Chen</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Pse-in-One 2.0: An Improved Package of Web Servers for Generating Various Modes of Pseudo Components of DNA, RNA, and Protein Sequences</article-title>. <source>Nat. Sci.</source> <volume>9</volume> (<issue>4</issue>), <fpage>67</fpage>&#x2013;<lpage>91</lpage>. <pub-id pub-id-type="doi">10.4236/ns.2017.94007</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>Y. W.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>C. J.</given-names>
</name>
</person-group> (<year>2008</year>). <source>Combining SVMs with Various Feature Selection Strategies Feature Extraction</source>. <publisher-loc>Taipei, Taiwan</publisher-loc>: <publisher-name>Studies in Fuzziness and Soft Computing</publisher-name>. </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>DincRNA: a Comprehensive Web-Based Bioinformatics Toolkit for Exploring Disease Associations and ncRNA Function</article-title>. <source>Bioinformatics</source> <volume>34</volume> (<issue>11</issue>), <fpage>1953</fpage>&#x2013;<lpage>1956</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty002</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>IntNetLncSim: an Integrative Network Analysis Method to Infer Human lncRNA Functional Similarity</article-title>. <source>Oncotarget</source> <volume>7</volume> (<issue>30</issue>), <fpage>47864</fpage>&#x2013;<lpage>47874</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.10012</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Pei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>MetSigDis: a Manually Curated Resource for the Metabolic Signatures of Diseases</article-title>. <source>Brief Bioinform</source> <volume>20</volume> (<issue>1</issue>), <fpage>203</fpage>&#x2013;<lpage>209</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbx103</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Christopher</surname>
<given-names>F. B.</given-names>
</name>
<name>
<surname>Dongwon</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Mccallion</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Beer</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Kmer-SVM: a Web Server for Identifying Predictive Regulatory Sequence Features in Genomic Data Sets</article-title>. <source>Nucleic Acids Res.</source> <volume>W1</volume>, <fpage>W544</fpage>&#x2013;<lpage>W556</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkt519</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Identification of Drug-Target Interactions via Dual Laplacian Regularized Least Squares with Multiple Kernel Fusion</article-title>. <source>Knowledge-Based Syst.</source> <volume>204</volume>, <fpage>106254</fpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2020.106254</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Identification of Drug&#x2013;Target Interactions via Fuzzy Bipartite Local Model</article-title>. <source>Neural Comput. Appl.</source> <volume>32</volume> (<issue>D1</issue>), <fpage>1</fpage>&#x2013;<lpage>17</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-019-04569-z</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>A New Taxonomy-Based Protein Fold Recognition Approach Based on Autocross-Covariance Transformation</article-title>. <source>Bioinformatics</source> <volume>25</volume> (<issue>20</issue>), <fpage>2655</fpage>&#x2013;<lpage>2662</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btp500</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Einarsd&#xf3;ttir</surname>
<given-names>I. E.</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>J&#xf6;nsson</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sundh</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Hasselberg-Frank</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Nilsen</surname>
<given-names>T. O.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Plasma Growth Hormone-Binding Protein Levels in Atlantic Salmon <italic>Salmo salar</italic> during Smoltification and Seawater Transfer</article-title>. <source>J.&#x20;Fish Biol.</source> <volume>85</volume> (<issue>4</issue>), <fpage>1279</fpage>&#x2013;<lpage>1296</lpage>. <pub-id pub-id-type="doi">10.1111/jfb.12473</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Circular RNAs Serve as Novel Biomarkers and Therapeutic Targets in Cancers</article-title>. <source>Cgt</source> <volume>19</volume> (<issue>2</issue>), <fpage>125</fpage>&#x2013;<lpage>133</lpage>. <pub-id pub-id-type="doi">10.2174/1566523218666181109142756</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>CD-HIT: Accelerated for Clustering the Next-Generation Sequencing Data</article-title>. <source>Bioinformatics</source> <volume>28</volume> (<issue>23</issue>), <fpage>3150</fpage>&#x2013;<lpage>3152</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts565</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Chinese Web Text Classification System Model Based on Naive Bayes</article-title>,&#x201d; in <conf-name>International Conference on E-product E-service &#x26; E-entertainment</conf-name>, <conf-loc>Henan, China</conf-loc>, <conf-date>7-9 Nov. 2010</conf-date>. </citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gumus</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Sakar</surname>
<given-names>C. O.</given-names>
</name>
<name>
<surname>Erdem</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kursun</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Online Naive Bayes Classification for Network Intrusion Detection</article-title>,&#x201d; in <conf-name>IEEE/ACM International Conference on Advances in Social Networks Analysis &#x26; Mining</conf-name>, <conf-loc>Beijing, China</conf-loc>, <conf-date>17-20 Aug. 2014</conf-date>. </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guohua</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Jincheng</surname>
<given-names>L</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Feature Extractions for Computationally Predicting Protein Post-Translational Modifications</article-title>. <source>Curr. Bioinformatics</source> <volume>12</volume> (<issue>4</issue>), <fpage>387</fpage>&#x2013;<lpage>395</lpage>. <pub-id pub-id-type="doi">10.2174/1574893612666170707094916</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MRMD2.0: A Python Tool for Machine Learning with Feature Ranking and Reduction</article-title>. <source>Curr. Bioinformatics</source> <volume>15</volume> (<issue>10</issue>), <fpage>1213</fpage>&#x2013;<lpage>1221</lpage>. </citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Dan</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Research of P2P Traffic Identification Based on Naive Bayes and Decision Tables Combination Algorithm</article-title>,&#x201d; in <conf-name>Seventh International Conference on Fuzzy Systems &#x26; Knowledge Discovery</conf-name>, <conf-loc>Yantai, China</conf-loc>, <conf-date>10-12 Aug. 2010</conf-date>. </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Integration of Multiple-Omics Data to Analyze the Population-specific Differences for Coronary Artery Disease</article-title>. <source>Comput. Math. Methods Med.</source> <volume>2021</volume>, <fpage>7036592</fpage>. <pub-id pub-id-type="doi">10.1155/2021/7036592</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.&#x20;Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Variant Associates with Alzheimer&#x27;s Disease and Regulates TMEM106B Expression in Human Brain Tissues</article-title>. <source>BMC Med.</source> <volume>19</volume> (<issue>1</issue>), <fpage>11</fpage>. <pub-id pub-id-type="doi">10.1186/s12916-020-01883-5</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>rs34331204 Regulates TSPAN13 Expression and Contributes to Alzheimer&#x27;s Disease with Sex Differences</article-title>. <source>Brain</source> <volume>143</volume> (<issue>11</issue>), <fpage>e95</fpage>. <pub-id pub-id-type="doi">10.1093/brain/awaa302</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hua</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y. W.</given-names>
</name>
<name>
<surname>Ping</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Rong</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>HBPred: a Tool to Identify Growth Hormone-Binding Proteins</article-title>. <source>Int. J.&#x20;Biol.</source> <volume>14</volume> (<issue>8</issue>), <fpage>957</fpage>&#x2013;<lpage>964</lpage>. <pub-id pub-id-type="doi">10.7150/ijbs.24174</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiao</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>iTTCA-RF: a Random forest Predictor for Tumor T&#x20;Cell Antigens</article-title>. <source>J.&#x20;Transl Med.</source> <volume>19</volume> (<issue>1</issue>), <fpage>449</fpage>. <pub-id pub-id-type="doi">10.1186/s12967-021-03084-x</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lei</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shanshan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>An In Silico Approach to Identification, Categorization and Prediction of Nucleic Acid Binding Proteins</article-title>. <source>Brief. Bioinform.</source> <volume>22</volume> (<issue>3</issue>), <fpage>bbaa171</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa171</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Development and Application of Artificial Intelligence Methods in Biological and Medical Data</article-title>. <source>Cbio</source> <volume>15</volume> (<issue>6</issue>), <fpage>515</fpage>&#x2013;<lpage>516</lpage>. <pub-id pub-id-type="doi">10.2174/157489361506200610112345</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K. C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Identification of microRNA Precursor with the Degenerate K-Tuple or Kmer Strategy</article-title>. <source>J.&#x20;Theor. Biol.</source> <volume>385</volume>, <fpage>153</fpage>&#x2013;<lpage>159</lpage>. <pub-id pub-id-type="doi">10.1016/j.jtbi.2015.08.025</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K. C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Pse-in-One: a Web Server for Generating Various Modes of Pseudo Components of DNA, RNA, and Protein Sequences</article-title>. <source>Nucleic Acids Res.</source> <volume>W1</volume>, <fpage>W65</fpage>&#x2013;<lpage>W71</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv458</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K. C.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>iDHS-EL: Identifying DNase I Hypersensitive Sites by Fusing Three Different Modes of Pseudo Nucleotide Composition into an Ensemble Learning Framework</article-title>. <source>Bioinformatics</source> <volume>32</volume> (<issue>16</issue>), <fpage>2411</fpage>&#x2013;<lpage>2418</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btw186</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>A Discriminative Method for Protein Remote Homology Detection and Fold Recognition Combining Top-N-Grams and Latent Semantic Analysis</article-title>. <source>BMC Bioinformatics</source> <volume>9</volume>, <fpage>510</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-9-510</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K.-C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Pse-in-One: a Web Server for Generating Various Modes of Pseudo Components of DNA, RNA, and Protein Sequences</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>W65</fpage>&#x2013;<lpage>W71</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv458</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Using Distances between Top-N-Gram and Residue Pairs for Protein Remote Homology Detection</article-title>. <source>BMC Bioinformatics</source> <volume>15</volume> (<issue>S2</issue>), <fpage>S3</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-15-s2-s3</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Godzik</surname>
<given-names>A</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Cd-hit: A Fast Program for Clustering and Comparing Large Sets of Protein or Nucleotide Sequences</article-title>. <source>Bioinformatics</source> <volume>22</volume> (<issue>13</issue>), <fpage>1658</fpage>&#x2013;<lpage>1659</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl158</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.-P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting lncRNA-Protein Interactions by Machine Learning Methods: A Review</article-title>. <source>Curr. Bioinformatics</source> <volume>15</volume> (<issue>8</issue>), <fpage>831</fpage>&#x2013;<lpage>840</lpage>. </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>BioSeq-Analysis: a Platform for DNA, RNA and Protein Sequence Analysis Based on Machine Learning Approaches</article-title>. <source>Brief. Bioinform.</source> <volume>20</volume> (<issue>4</issue>), <fpage>1280</fpage>&#x2013;<lpage>1294</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbx165</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manavalan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Basith</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D. Y.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>4mCpred-EL: An Ensemble Learning Framework for Identification of DNA N4-Methylcytosine Sites in the Mouse Genome</article-title>. <source>Cells</source> <volume>8</volume> (<issue>11</issue>), <fpage>1332</fpage>. <pub-id pub-id-type="doi">10.3390/cells8111332</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mortezaeefar</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fotovat</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shekari</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Sasani</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Comprehensive Understanding of the Interaction Among Stress Hormones Signalling Pathways by Gene Co-expression Network</article-title>. <source>Cbio</source> <volume>14</volume> (<issue>7</issue>), <fpage>602</fpage>&#x2013;<lpage>613</lpage>. <pub-id pub-id-type="doi">10.2174/1574893614666190226160742</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>rBPDL: Predicting RNA-Binding Proteins Using Deep Learning</article-title>. <source>IEEE J.&#x20;Biomed. Health Inform.</source> (<issue>99</issue>), <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1109/jbhi.2021.3069259</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Niu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>SgRNA-RF: Identification of SgRNA On-Target Activity with Imbalanced Datasets</article-title>. <source>Ieee/acm Trans. Comput. Biol. Bioinf.</source> <volume>105</volume> (<issue>16</issue>), <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1109/tcbb.2021.3079116</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Polat</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>G&#xfc;ne&#x15f;</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>A New Feature Selection Method on Classification of Medical Datasets: Kernel F-Score Feature Selection</article-title>. <source>Expert Syst. Appl.</source> <volume>36</volume> (<issue>7</issue>), <fpage>10367</fpage>&#x2013;<lpage>10373</lpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2009.01.041</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Identification of DNA-Binding Proteins Using Mixed Feature Representation Methods</article-title>. <source>Molecules</source> <volume>22</volume> (<issue>10</issue>), <fpage>1602</fpage>. <pub-id pub-id-type="doi">10.3390/molecules22101602</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Prediction of Diabetic Protein Markers Based on an Ensemble Method</article-title>. <source>Front. Bioscience-Landmark</source> <volume>26</volume> (<issue>7</issue>), <fpage>207</fpage>&#x2013;<lpage>221</lpage>. <pub-id pub-id-type="doi">10.52586/4935</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Quan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A Novel Features Ranking Metric with Application to Scalable Visual and Bioinformatics Data Classification</article-title>. <source>Neurocomputing</source> <volume>173</volume>, <fpage>346</fpage>&#x2013;<lpage>354</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2014.12.123</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riaz</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Non-coding RNA Associated Competitive Endogenous RNA Regulatory Network: Novel Therapeutic Approach in Liver Fibrosis</article-title>. <source>Cgt</source> <volume>19</volume> (<issue>5</issue>), <fpage>305</fpage>&#x2013;<lpage>317</lpage>. <pub-id pub-id-type="doi">10.2174/1566523219666191107113046</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Schneider</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2012</year>). <source>The Annotation of Plant Proteins in UniProtKB</source>. <publisher-loc>California</publisher-loc>: <publisher-name>Plant &#x26; Animal Genome</publisher-name>. </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Basic Polar and Hydrophobic Properties Are the Main Characteristics that Affect the Binding of Transcription Factors to Methylation Sites</article-title>. <source>Bioinformatics</source> <volume>36</volume> (<issue>15</issue>), <fpage>4263</fpage>&#x2013;<lpage>4268</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa492</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Snow</surname>
<given-names>R. W.</given-names>
</name>
<name>
<surname>Guerra</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Noor</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Myint</surname>
<given-names>H. Y.</given-names>
</name>
<name>
<surname>Hay</surname>
<given-names>S. I.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>The Global Distribution of Clinical Episodes of Plasmodium Falciparum Malaria - Supplementary Information</article-title>. <source>Nature</source> <volume>434</volume>, <fpage>214</fpage>&#x2013;<lpage>217</lpage>. <pub-id pub-id-type="doi">10.1038/nature03342</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sohm</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Manfroid</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Pezet</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Rentier-Delrue</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Rand-Weaver</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kelly</surname>
<given-names>P. A.</given-names>
</name>
<etal/>
</person-group> (<year>1998</year>). <article-title>Identification and Modulation of a Growth Hormone-Binding Protein in Rainbow trout (<italic>Oncorhynchus mykiss</italic>) Plasma during Seawater Adaptation</article-title>. <source>Gen. Comp. Endocrinol.</source> <volume>111</volume> (<issue>2</issue>), <fpage>216</fpage>&#x2013;<lpage>224</lpage>. <pub-id pub-id-type="doi">10.1006/gcen.1998.7106</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep-Resp-Forest: A Deep forest Model to Predict Anti-cancer Drug Response</article-title>. <source>Methods</source> <volume>166</volume>, <fpage>91</fpage>&#x2013;<lpage>102</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2019.02.009</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>J.-X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.-H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.-M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.-X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Identification of Hormone Binding Proteins Based on Machine Learning Methods</article-title>. <source>Math. Biosci. Eng.</source> <volume>16</volume> (<issue>4</issue>), <fpage>2466</fpage>&#x2013;<lpage>2480</lpage>. <pub-id pub-id-type="doi">10.3934/mbe.2019123</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.-W.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.-M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>HBPred: a Tool to Identify Growth Hormone-Binding Proteins</article-title>. <source>Int. J.&#x20;Biol. Sci.</source> <volume>14</volume> (<issue>8</issue>), <fpage>957</fpage>&#x2013;<lpage>964</lpage>. <pub-id pub-id-type="doi">10.7150/ijbs.24174</pub-id> </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A Drug Target Interaction Prediction Based on LINE-RF Learning</article-title>. <source>Cbio</source> <volume>15</volume> (<issue>7</issue>), <fpage>750</fpage>&#x2013;<lpage>757</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615666191227092453</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting Drug-Target Interactions via FM-DNN Learning</article-title>. <source>Cbio</source> <volume>15</volume> (<issue>1</issue>), <fpage>68</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.2174/1574893614666190227160538</pub-id> </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Identification of Hormone-Binding Proteins Using a Novel Ensemble Classifier</article-title>. <source>Computing</source> <volume>101</volume> (<issue>6</issue>), <fpage>693</fpage>&#x2013;<lpage>703</lpage>. <pub-id pub-id-type="doi">10.1007/s00607-018-0682-x</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>X.-F.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.-F.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.-F.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting Thermophilic Proteins by Machine Learning</article-title>. <source>Cbio</source> <volume>15</volume> (<issue>5</issue>), <fpage>493</fpage>&#x2013;<lpage>502</lpage>. <pub-id pub-id-type="doi">10.2174/1574893615666200207094357</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Luan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Manavalan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Iterative Feature Representations Improve N4-Methylcytosine Site Prediction</article-title>. <source>Bioinformatics</source> <volume>35</volume> (<issue>23</issue>), <fpage>4930</fpage>&#x2013;<lpage>4937</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz408</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Integration of Deep Feature Representations and Handcrafted Features to Improve the Prediction of N 6&#x20;-methyladenosine Sites</article-title>. <source>Neurocomputing</source> <volume>324</volume>, <fpage>3</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2018.04.082</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Prediction of lncRNA-Protein Interactions Using HeteSim Scores Based on Heterogeneous Networks</article-title>. <source>Sci. Rep.</source> <volume>7</volume> (<issue>1</issue>), <fpage>3664</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-017-03986-1</pub-id> </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>XY</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>SW</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>SY</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Prediction of Drug-Target Interaction by Label Propagation with Mutual Interaction Information Derived from Heterogeneous Network</article-title>. <source>Mol. Biosyst.</source> <volume>12</volume>, <fpage>520</fpage>&#x2013;<lpage>531</lpage>. <pub-id pub-id-type="doi">10.1039/c5mb00615e</pub-id> </citation>
</ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Risk Prediction of Diabetes: Big Data Mining with Fusion of Multifarious Physical Examination Indicators</article-title>. <source>Inf. Fusion</source> <volume>75</volume>, <fpage>140</fpage>&#x2013;<lpage>149</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2021.02.015</pub-id> </citation>
</ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Identification of Cytokine via an Improved Genetic Algorithm</article-title>. <source>Front. Comp. Sci.</source> <volume>9</volume> (<issue>004</issue>), <fpage>643</fpage>&#x2013;<lpage>651</lpage>. <pub-id pub-id-type="doi">10.1007/s11704-014-4089-3</pub-id> </citation>
</ref>
<ref id="B64">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhong</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting Disease-Associated Circular RNAs Using Deep Forests Combined with Positive-Unlabeled Learning Methods</article-title>. <source>Brief. Bioinform.</source> <volume>21</volume> (<issue>4</issue>), <fpage>1425</fpage>&#x2013;<lpage>1436</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbz080</pub-id> </citation>
</ref>
<ref id="B65">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ju</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xuan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Accurate Identification of Cancerlectins through Hybrid Machine Learning Technology</article-title>. <source>Int. J.&#x20;Genomics</source> <volume>2016</volume> (<issue>7-13</issue>), <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1155/2016/7604641</pub-id> </citation>
</ref>
<ref id="B66">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>The Linear Neighborhood Propagation Method for Predicting Long Non-coding RNA&#x2013;Protein Interactions</article-title>. <source>Neurocomputing</source> <volume>273</volume> (<issue>jan.17</issue>), <fpage>526</fpage>&#x2013;<lpage>534</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2017.07.065</pub-id> </citation>
</ref>
<ref id="B67">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Functional Immunoregulation by Heme Oxygenase 1 in Juvenile Autoimmune Diseases</article-title>. <source>Cgt</source> <volume>19</volume> (<issue>2</issue>), <fpage>110</fpage>&#x2013;<lpage>116</lpage>. <pub-id pub-id-type="doi">10.2174/1566523219666190710092935</pub-id> </citation>
</ref>
<ref id="B68">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Marchant</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Identification of Serum GH-Binding Proteins in the Goldfish (Carassius auratus) and Comparison with Mammalian GH-Binding Proteins</article-title>. <source>J.&#x20;Endocrinol.</source> <volume>161</volume> (<issue>2</issue>), <fpage>255</fpage>&#x2013;<lpage>262</lpage>. <pub-id pub-id-type="doi">10.1677/joe.0.1610255</pub-id> </citation>
</ref>
<ref id="B69">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ju</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Pretata: Predicting TATA Binding Proteins with Novel Features and Dimensionality Reduction Strategy</article-title>. <source>BMC Syst. Biol.</source> <volume>10</volume> (<issue>4</issue>), <fpage>114</fpage>. <pub-id pub-id-type="doi">10.1186/s12918-016-0353-5</pub-id> </citation>
</ref>
<ref id="B70">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Sequence Clustering in Bioinformatics: an Empirical Study</article-title>. <source>Brief. Bioinform.</source> <volume>21</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bby090</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>