<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2023.1291352</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep-STP: a deep learning-based approach to predict snake toxin proteins by using word embeddings</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Zulfiqar</surname>
<given-names>Hasan</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1101292/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Guo</surname>
<given-names>Zhiling</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ahmad</surname>
<given-names>Ramala Masood</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ahmed</surname>
<given-names>Zahoor</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1533846/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Cai</surname>
<given-names>Peiling</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1734004/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname>
<given-names>Xiang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zhang</surname>
<given-names>Yang</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2176592/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Lin</surname>
<given-names>Hao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/182351/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Shi</surname>
<given-names>Zheng</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="corresp" rid="c003"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1330752/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Yangtze Delta Region Institute (Huzhou), University of Electronic Science and Technology of China</institution>, <addr-line>Huzhou, Zhejiang</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Beidahuang Industry Group General Hospital</institution>, <addr-line>Harbin</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Plant Breeding and Genetics, University of Agriculture Faisalabad</institution>, <addr-line>Faisalabad</addr-line>, <country>Pakistan</country></aff>
<aff id="aff4"><sup>4</sup><institution>School of Basic Medical Sciences, Chengdu University</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>Innovative Institute of Chinese Medicine and Pharmacy, Academy for Interdiscipline, Chengdu University of Traditional Chinese Medicine</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff>
<aff id="aff6"><sup>6</sup><institution>Clinical Genetics Laboratory, Clinical Medical College &#x0026; Affiliated Hospital, Chengdu University</institution>, <addr-line>Chengdu</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0002"><p>Edited by: Waqas Nazeer, University of Education, Winneba, Ghana</p></fn>
<fn fn-type="edited-by" id="fn0003"><p>Reviewed by: Santosh Panjikar, Australian Synchrotron, Australia; Wang-Ren Qiu, Jingdezhen Ceramic Institute, China</p></fn>
<corresp id="c001">&#x002A;Correspondence: Yang Zhang, <email>yangzhang@cdutcm.edu.cn</email></corresp>
<corresp id="c002">Hao Lin, <email>hlin@uestc.edu.cn</email></corresp>
<corresp id="c003">Zheng Shi, <email>drshiz1002@hotmail.com</email></corresp>
<fn fn-type="equal" id="fn0001"><p><sup>&#x2020;</sup>These authors have contributed equally to this work</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>17</day>
<month>01</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>10</volume>
<elocation-id>1291352</elocation-id>
<history>
<date date-type="received">
<day>09</day>
<month>09</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>12</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2024 Zulfiqar, Guo, Ahmad, Ahmed, Cai, Chen, Zhang, Lin and Shi.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Zulfiqar, Guo, Ahmad, Ahmed, Cai, Chen, Zhang, Lin and Shi</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Snake venom contains many toxic proteins that can destroy the circulatory system or nervous system of prey. Studies have found that these snake venom proteins have the potential to treat cardiovascular and nervous system diseases. Therefore, the study of snake venom protein is conducive to the development of related drugs. The research technologies based on traditional biochemistry can accurately identify these proteins, but the experimental cost is high and the time is long. Artificial intelligence technology provides a new means and strategy for large-scale screening of snake venom proteins from the perspective of computing. In this paper, we developed a sequence-based computational method to recognize snake toxin proteins. Specifically, we utilized three different feature descriptors, namely <italic>g-gap</italic>, natural vector and word2vector, to encode snake toxin protein sequences. Analysis of variance (ANOVA) and the gradient-boost decision tree algorithm (GBDT), combined with incremental feature selection (IFS), were used to optimize the features, and then the optimized features were input into the deep learning model for model training. The results show that our model can achieve a prediction performance with an accuracy of 82.00% in 10-fold cross-validation. The model is further verified on independent data, and the accuracy rate reaches 81.14%, which demonstrates that our model has excellent prediction performance and robustness.</p>
</abstract>
<kwd-group>
<kwd>snake toxin</kwd>
<kwd>deep learning</kwd>
<kwd>feature vectors</kwd>
<kwd>word embedding</kwd>
<kwd>feature selection</kwd>
<kwd>ANOVA</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="2"/>
<equation-count count="8"/>
<ref-count count="36"/>
<page-count count="7"/>
<word-count count="4178"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Precision Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Snake venom is a mixture of toxin proteins and other chemical molecules, which acts on the blood circulation system, nervous system or motion system of prey. It can make the prey lose resistance, and then achieve the purpose of predation. Many toxin enzymes have been isolated from snake venoms, such as serine proteinases, metalloproteinase and L-amino acid oxidases, which can interrupt the blood circulatory system, leading to blood clotting and heart failure. Moreover, the scientists found that the primary toxins of <italic>Pseudechis australis</italic> venom with antibacterial activity were phospholipases A2 and L-amino acid oxidases. The L-amino acid oxidase discovered in the venom of <italic>Crotalus adamanteus</italic> was the first pure toxin tested against bacteria. Since then, crude snake venom, portions of it, or refined components have all shown antibacterial activity. The mechanism of anti-microbial activity of snake toxin proteins is shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Schematic diagram of the anti-microbial activities of snake toxin proteins.</p>
</caption>
<graphic xlink:href="fmed-10-1291352-g001.tif"/>
</fig>
<p>Many toxin proteins were found in snake venom, such as phospholipases A<sub>2</sub>, cysteine-rich secretory proteins (CRISP), &#x03B1;-dendrotoxins, &#x03B2;-dendrotoxins and &#x03B3;-dendrotoxins which could interact with the nervous system or molecules in the nervous system (<xref ref-type="bibr" rid="ref1">1</xref>, <xref ref-type="bibr" rid="ref2">2</xref>). Scientists have also obtained some venomous proteins, for example, three finger &#x03B1;-neurotoxins (&#x03B1;-3FNTx) and acetylcholine esterase proteins, which target the motion system of prey and cause paralysis (<xref ref-type="bibr" rid="ref3">3</xref>). Surprisingly, the components extracted from snakes can be used as drugs to cure various diseases (<xref ref-type="bibr" rid="ref4">4</xref>). At present, scientists have extracted several drugs from snake toxin proteins for the treatment of heart-related syndromes. For example, captopril is now used to treat hypertension and reduce the risk of heart failure after a heart attack (<xref ref-type="bibr" rid="ref5">5</xref>). Therefore, the correct identification of snake venom protein is very important for the study of drug development based on snake venom. Biochemical technologies are complicated, tedious and expensive. Thus, there is an urgent need to develop bioinformatic tools that can precisely identify snake toxins in a short time. Current bioinformatic tools, such as FASTA (<xref ref-type="bibr" rid="ref6">6</xref>), HAlign (<xref ref-type="bibr" rid="ref7">7</xref>, <xref ref-type="bibr" rid="ref8">8</xref>) and BLAST (<xref ref-type="bibr" rid="ref9">9</xref>) can search for similar sequences with the help of known protein databases. However, in the absence of homologous sequences in the benchmark dataset, these computational tools cannot correctly recognize snake toxin proteins. Therefore, it is essential to establish a computational tool to recognize snake toxin proteins.</p>
<p>To fill the gap, we proposed the first predictor named Deep-STP based on deep learning to recognize snake toxin proteins. The graphical illustration of the entire study is shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>. First, the snake toxin protein sequences were encoded by three different kinds of descriptors, namely, word2vector (<xref ref-type="bibr" rid="ref10">10</xref>), <italic>g-gap</italic> and natural vector (<xref ref-type="bibr" rid="ref11">11</xref>). Subsequently, the feature set was optimized by combining ANOVA (<xref ref-type="bibr" rid="ref11">11</xref>) and GBDT (<xref ref-type="bibr" rid="ref12">12</xref>) with the IFS procedure. By inputting the optimal features into a deep learning model, the snake toxin proteins can be recognized. The performance of the proposed model was evaluated by 10-fold cross-validation (CV) and independent data.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>The graphical illustration of the entire study.</p>
</caption>
<graphic xlink:href="fmed-10-1291352-g002.tif"/>
</fig>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<p>Real and reliable data are crucial for the establishment of a prediction model. In this work, positive and negative samples were collected from the open-source databases UniProt (<xref ref-type="bibr" rid="ref13">13</xref>) and RefSeq (<xref ref-type="bibr" rid="ref14">14</xref>). We excluded similar sequences using 80% as the cutoff of sequence identity (<xref ref-type="bibr" rid="ref15">15</xref>). After the elimination process, we finally obtained a dataset of 270 positive and 339 negative sequences of the prominent protein families of snake toxin. Subsequently, the data were separated into 80% training data and 20% independent data to objectively estimate the efficiencies and performances of the models, as shown in <xref ref-type="supplementary-material" rid="SM1">Supplementary Table S1</xref>.</p>
<sec id="sec3">
<label>2.1</label>
<title>Feature descriptors</title>
<p>It is an important step for protein function prediction to express the sequence information with effective mathematical descriptors (<xref ref-type="bibr" rid="ref16">16</xref>). Here, three kinds of feature descriptors were used to encode the snake toxin protein sequences.</p>
<sec id="sec4">
<label>2.1.1</label>
<title>g-gap dipeptide composition</title>
<p>The relationship between the two end-to-end 2-D amino acid residues can be expressed using this feature encoding approach. Consequently, important links between two residues are found using <italic>g-gap</italic> dipeptide composition. Thus, a protein &#x2018;<italic>F</italic>&#x2019; can be described as</p><disp-formula id="EQ1">
<label>(1)</label>
<mml:math id="M1">
<mml:mi>F</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mfenced open="[" close="]" separators=",">
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>3</mml:mn>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:mo>&#x2026;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:mo>&#x2026;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>400</mml:mn>
<mml:mi>p</mml:mi>
</mml:msubsup>
</mml:mfenced>
<mml:mi>t</mml:mi>
</mml:msup>
</mml:math>
</disp-formula><p>where &#x2018;<italic>t</italic>&#x2019; denotes the transpose of the vector and <inline-formula>
<mml:math id="M2">
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula> is the frequency of the <italic>i</italic>-th <italic>g-gap</italic> dipeptide, which is defined as</p><disp-formula id="EQ2">
<label>(2)</label>
<mml:math id="M3">
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:msubsup>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula><p>where &#x2018;<italic>p</italic>&#x2019; is the number of amino acid residues separating the two residues of the dipeptide (the gap), <inline-formula>
<mml:math id="M4">
<mml:msubsup>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>p</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula> is the number of occurrences of the <italic>i</italic>-th <italic>g-gap</italic> dipeptide and &#x2018;<italic>L</italic>&#x2019; is the length of protein &#x2018;<italic>F</italic>&#x2019;.</p>
</sec>
<sec id="sec5">
<label>2.1.2</label>
<title>Natural vector</title>
<p>As a starting point for phylogenetic and evolutionary study, the natural vector scheme (NV) was created by Deng et al. (<xref ref-type="bibr" rid="ref17">17</xref>). Here, we have also used NV to formulate the samples. A 60-dimensional vector can be created using this approach to plot biological sequences. The NV scheme has a significant ability to classify proteins because it has no parameters (<xref ref-type="bibr" rid="ref18">18</xref>).</p>
<p>Suppose a protein &#x2018;<italic>P</italic>&#x2019; with a length of &#x2018;<italic>L</italic>&#x2019; residues is expressed as</p><disp-formula id="E1">
<label>(3)</label>
<mml:math id="M5">
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:msub>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2026;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x2026;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mi mathvariant="normal">L</mml:mi>
</mml:msub>
</mml:math>
</disp-formula><p>where <italic>Q<sub>i</sub></italic> (<italic>i</italic>&#x2009;=&#x2009;1, 2, &#x2026;, <italic>L</italic>) indicates the <italic>i</italic>-th amino acid of protein &#x2018;<italic>P</italic>&#x2019;. The NV is expressed as follows.</p>
<p>w<italic><sub>k</sub></italic> (.): {A, C, D, E, &#x2026;, W, Y}&#x2009;&#x2192;&#x2009;{0, 1}, where w<italic><sub>k</sub></italic> (<italic>Q<sub>i</sub></italic>)&#x2009;=&#x2009;1 if <italic>Q<sub>i</sub></italic>&#x2009;=&#x2009;<italic>k</italic>; otherwise, w<italic><sub>k</sub></italic> (<italic>Q<sub>i</sub></italic>)&#x2009;=&#x2009;0.</p>
<p>In protein &#x2018;<italic>P</italic>&#x2019;, <italic>m<sub>k</sub></italic> is the number of occurrences of amino acid <italic>k</italic>, which can be computed as</p><disp-formula id="EQ3">
<label>(4)</label>
<mml:math id="M6">
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi mathvariant="normal">w</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mspace width="0.25em"/>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</disp-formula><p>Let <italic>T</italic><sub>(<italic>k</italic>)(<italic>i</italic>)</sub> be the gap between the first and <italic>i</italic>-th amino acid, <italic>&#x03B7;<sub>k</sub></italic> be the mean of the amino acids <italic>k</italic> and <italic>S<sub>k</sub></italic> be the overall distance, as shown in <xref ref-type="disp-formula" rid="EQ4">equation (5)</xref>.</p><disp-formula id="EQ4">
<label>(5)</label>
<mml:math id="M7">
<mml:mo stretchy="true">{</mml:mo>
<mml:mtable columnalign="left">
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mi>k</mml:mi>
</mml:mfenced>
<mml:mfenced open="(" close=")">
<mml:mi>i</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:munderover>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mi>k</mml:mi>
</mml:mfenced>
<mml:mfenced open="(" close=")">
<mml:mi>i</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:msub>
<mml:mi>&#x03B7;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo stretchy="true">/</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>
<p>Let &#x2018;<inline-formula>
<mml:math id="M8">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>k</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula>&#x2019; be the second-order regularized moment, which is computed as</p><disp-formula id="EQ5">
<label>(6)</label>
<mml:math id="M9">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:munderover>
<mml:mfrac>
<mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mi>k</mml:mi>
</mml:mfenced>
<mml:mfenced open="(" close=")">
<mml:mi>i</mml:mi>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x03B7;</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula><p>Thus, &#x2018;<italic>P</italic>&#x2019; can be termed as</p><disp-formula id="EQ6">
<label>(7)</label>
<mml:math id="M10">
<mml:mi>P</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x03B7;</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>A</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x03B7;</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>R</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mi>Y</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x03B7;</mml:mi>
<mml:mi>Y</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mn>2</mml:mn>
<mml:mi>Y</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:math>
</disp-formula><p>where &#x2018;<italic>T</italic>&#x2019; denotes the transpose of the vector.</p>
</sec>
<sec id="sec6">
<label>2.1.3</label>
<title>Word2Vector</title>
<p>&#x2018;Word2vector&#x2019; (W2V) is a natural language processing (NLP) technique that utilizes neural networks to produce distributed representations of words (<xref ref-type="bibr" rid="ref19">19</xref>, <xref ref-type="bibr" rid="ref20">20</xref>). In this method, word embeddings are utilized to represent words. Words whose vectors lie close together in the vector space are assumed to have similar meanings. The &#x2018;word2vector&#x2019; consists of two different kinds of models, namely, continuous bag of words (<xref ref-type="bibr" rid="ref21">21</xref>) and continuous skip gram (<xref ref-type="bibr" rid="ref22">22</xref>). The main idea of the continuous skip gram is to utilize a word to predict its adjoining words (<xref ref-type="bibr" rid="ref23">23</xref>). The continuous bag of words model, in contrast, uses the context words within a nearby window to predict the target word. The continuous bag of words model structure logically implies the advantage of consistently condensing the scattered information in the data. Thus, in this work, we employed the continuous bag of words to train the appropriate representation of protein sequences. The dimension of the word2vector embedding is 200.</p>
</sec>
</sec>
<sec id="sec7">
<label>2.2</label>
<title>Feature selection</title>
<p>Redundancy in the feature vectors can produce unsatisfactory performance (<xref ref-type="bibr" rid="ref24">24</xref>). Therefore, selecting the ideal features is a significant step to eliminate the irrelevant features and enhance the efficiency of the model (<xref ref-type="bibr" rid="ref25">25</xref>). There are many feature selection and ranking methods to optimize the features, such as ANOVA (<xref ref-type="bibr" rid="ref26">26</xref>, <xref ref-type="bibr" rid="ref27">27</xref>), F-score (<xref ref-type="bibr" rid="ref28">28</xref>), mRMR (<xref ref-type="bibr" rid="ref29">29</xref>), GBDT and LGBM (<xref ref-type="bibr" rid="ref12">12</xref>). ANOVA is a reputable choice to overcome these complications, because it takes little time and yields effective outcomes. However, merging the top-performing features does not guarantee that the best outcomes can be achieved. These features are likely to have a high level of redundancy, which introduces unnecessary information into the feature set. Hence, GBDT is an ideal choice to overcome these problems. In this work, ANOVA and GBDT with IFS were employed to obtain the best feature subset, which could produce the maximum accuracy. The whole procedure for feature selection has already been elucidated in our previous study (<xref ref-type="bibr" rid="ref12">12</xref>). The prediction accuracy of models constructed with different numbers of features and the contribution of feature descriptors are shown in <xref ref-type="fig" rid="fig3">Figures 3A</xref>,<xref ref-type="fig" rid="fig3">B</xref>.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>The prediction accuracy of models constructed with different numbers of features <bold>(A)</bold>. Contribution of descriptors in CNN-based fusion model to classify snake toxin proteins <bold>(B)</bold>.</p>
</caption>
<graphic xlink:href="fmed-10-1291352-g003.tif"/>
</fig>
</sec>
<sec id="sec8">
<label>2.3</label>
<title>Convolutional neural network</title>
<p>Convolutional neural networks (CNNs) were first developed by LeCun et al. (<xref ref-type="bibr" rid="ref30">30</xref>) and are now widely used in biology and bioinformatics (<xref ref-type="bibr" rid="ref31">31</xref>). The core idea behind CNN is to use layer-wise convolutions and pooling techniques to build a large number of filters that can extract hidden topological properties from input. The performance of CNN on 2-D image and matrix data has been excellent (<xref ref-type="bibr" rid="ref32">32</xref>). Moreover, 1-D CNN has been utilized to solve natural language processing and biomedical sequence data recognition problems (<xref ref-type="bibr" rid="ref33">33</xref>). In this work, we used a 1-D CNN to recognize snake toxin proteins. We utilized Keras 2.3.1 (<xref ref-type="bibr" rid="ref34">34</xref>), Python 3.5.4 and TensorFlow 2.1.0 to carry out the experiments.</p>
</sec>
<sec id="sec9">
<label>2.4</label>
<title>Metrics evaluation</title>
<p>Accuracy, precision, recall and F1-score (<xref ref-type="bibr" rid="ref35">35</xref>) were used to assess the efficiency of the proposed model and can be expressed as</p><disp-formula id="EQ7">
<label>(8)</label>
<mml:math id="M11">
<mml:mo stretchy="true">{</mml:mo>
<mml:mtable columnalign="left">
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mi mathvariant="italic">Precision</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mi mathvariant="italic">Recall</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mi mathvariant="italic">Accuracy</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="left">
<mml:mtd columnalign="left">
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>=</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="italic">Precision</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi mathvariant="italic">Recall</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">Precision</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="italic">Recall</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula><p>where &#x2018;<italic>TP</italic>&#x2019; represents the correctly predicted snake toxin protein sequences and &#x2018;<italic>FP</italic>&#x2019; indicates the non-snake toxin protein sequences predicted as snake toxin protein sequences. &#x2018;<italic>TN</italic>&#x2019; symbolizes the correctly predicted non-snake toxin protein sequences and &#x2018;<italic>FN</italic>&#x2019; denotes the snake toxin protein sequences which were predicted as non-snake toxin protein sequences.</p>
</sec>
</sec>
<sec sec-type="results" id="sec10">
<label>3</label>
<title>Results and discussion</title>
<sec id="sec11">
<label>3.1</label>
<title>Performance evaluation</title>
<p>Initially, we converted the sequence data into feature vectors by using three types of feature encoding schemes. Then, each feature vector was assessed by a CNN-based classifier by employing a 10-fold CV. Subsequently, ANOVA and GBDT were implemented to select the optimal features. <xref ref-type="fig" rid="fig3">Figure 3A</xref> displays the prediction accuracy of models constructed with different numbers of features. The maximum accuracy of 82.00% was achieved on 167 optimal features. <xref ref-type="fig" rid="fig3">Figure 3B</xref> shows the contribution of feature descriptors in the CNN-based fusion model. The optimal model was trained on the data with 167 features derived from three kinds of descriptors. In the final optimized fusion model, NV, W2V and <italic>g-gap</italic> dipeptide descriptors account for 35.92, 43.11, and 20.95%, respectively. We have also visualized the feature fusions by using the <italic>t</italic>-SNE (<italic>t</italic>-distributed stochastic neighbor embedding) technique. The <italic>t</italic>-SNE visualizations of feature fusion before and after the feature selection are shown in <xref ref-type="fig" rid="fig4">Figures 4A</xref>,<xref ref-type="fig" rid="fig4">B</xref>. <xref ref-type="fig" rid="fig4">Figure 4C</xref> shows the single-encoding performance on different machine learning-based (ML-based) classifiers before the selection of features (<xref ref-type="bibr" rid="ref36">36</xref>) and <xref ref-type="fig" rid="fig4">Figure 4D</xref> shows the performance of single-encoding after feature selection on different ML-based classifiers. <xref ref-type="table" rid="tab1">Table 1</xref> also shows the performance of feature fusion models before and after the feature selection on different ML-based classifiers by utilizing 10-fold CV.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Visualization of feature fusion before the feature selection <bold>(A)</bold>. Visualization of feature fusion after the feature selection <bold>(B)</bold>. Performance of single-encoded features on different classifiers before the feature selection <bold>(C)</bold>. Performance of single-encoded features on different classifiers after the feature selection <bold>(D)</bold>. Comparison of proposed CNN-based fusion model with different machine learning-based fusion models on the basis of 10-fold CV <bold>(E)</bold>. Comparison of proposed CNN-based fusion model with different machine learning-based fusion models on independent data <bold>(F)</bold>.</p>
</caption>
<graphic xlink:href="fmed-10-1291352-g004.tif"/>
</fig>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Performance of fusion models by using different algorithms.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Algorithm</th>
<th align="left" valign="top">FS</th>
<th align="center" valign="top">Dimension</th>
<th align="center" valign="top">Accuracy</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">F1</th>
<th align="center" valign="top">AUROC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top" rowspan="2">RF</td>
<td align="left" valign="top">No</td>
<td align="center" valign="top">487</td>
<td align="center" valign="top">77.35</td>
<td align="center" valign="top">76.84</td>
<td align="center" valign="top">78.21</td>
<td align="center" valign="top">78.87</td>
<td align="center" valign="top">0.863</td>
</tr>
<tr>
<td align="left" valign="top">Yes</td>
<td align="center" valign="top">189</td>
<td align="center" valign="top">79.80</td>
<td align="center" valign="top">80.10</td>
<td align="center" valign="top">78.88</td>
<td align="center" valign="top">79.56</td>
<td align="center" valign="top">0.881</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">LSTM</td>
<td align="left" valign="top">No</td>
<td align="center" valign="top">487</td>
<td align="center" valign="top">79.74</td>
<td align="center" valign="top">79.68</td>
<td align="center" valign="top">80.20</td>
<td align="center" valign="top">78.89</td>
<td align="center" valign="top">0.895</td>
</tr>
<tr>
<td align="left" valign="top">Yes</td>
<td align="center" valign="top">227</td>
<td align="center" valign="top">80.50</td>
<td align="center" valign="top">80.37</td>
<td align="center" valign="top">80.08</td>
<td align="center" valign="top">79.00</td>
<td align="center" valign="top">0.901</td>
</tr>
<tr>
<td align="left" valign="top" rowspan="2">CNN</td>
<td align="left" valign="top">No</td>
<td align="center" valign="top">487</td>
<td align="center" valign="top">81.22</td>
<td align="center" valign="top">83.11</td>
<td align="center" valign="top">78.01</td>
<td align="center" valign="top">79.88</td>
<td align="center" valign="top">0.904</td>
</tr>
<tr>
<td align="left" valign="top">Yes</td>
<td align="center" valign="top">167</td>
<td align="center" valign="top">82.00</td>
<td align="center" valign="top">84.17</td>
<td align="center" valign="top">79.32</td>
<td align="center" valign="top">80.73</td>
<td align="center" valign="top">0.926</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The comparisons of the proposed CNN-based fusion model with different machine learning-based fusion models on 10-fold CV as well as on the independent dataset are shown in <xref ref-type="fig" rid="fig4">Figures 4E</xref>,<xref ref-type="fig" rid="fig4">F</xref>. From these comparisons, we may conclude that the best model is based on the CNN with 167 optimal features. The model could produce AUROCs of 0.926 and 0.917 on the training and independent datasets, respectively.</p>
</sec>
<sec id="sec12">
<label>3.2</label>
<title>Performance evaluation of different ML algorithms</title>
<p>Various single features and their fusions were input into other ML-based classifiers, such as long short-term memory (LSTM) and random forest (RF), to determine which machine learning method is the best for snake toxin prediction. The 10-fold CV and independent dataset test were employed to estimate the efficiency of these models. The comparison outcomes are shown in <xref ref-type="table" rid="tab1">Tables 1</xref>, <xref ref-type="table" rid="tab2">2</xref>. We noticed that the AUROC of the CNN-based prediction model was 2.5&#x2013;4.5% higher than that of the other classifiers on 10-fold CV and 1.7&#x2013;4.1% higher than that of the other classifiers on the independent test. <xref ref-type="fig" rid="fig5">Figures 5A</xref>&#x2013;<xref ref-type="fig" rid="fig5">D</xref> show that the CNN-based prediction model is the best among all classifiers.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Performance of fusion models on independent data.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Algorithm</th>
<th align="center" valign="top">Accuracy</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">F1</th>
<th align="center" valign="top">AUROC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">RF</td>
<td align="center" valign="top">78.20</td>
<td align="center" valign="top">77.84</td>
<td align="center" valign="top">78.28</td>
<td align="center" valign="top">78.56</td>
<td align="center" valign="top">0.876</td>
</tr>
<tr>
<td align="left" valign="top">LSTM</td>
<td align="center" valign="top">80.10</td>
<td align="center" valign="top">80.25</td>
<td align="center" valign="top">79.89</td>
<td align="center" valign="top">80.09</td>
<td align="center" valign="top">0.900</td>
</tr>
<tr>
<td align="left" valign="top">CNN</td>
<td align="center" valign="top">81.14</td>
<td align="center" valign="top">82.08</td>
<td align="center" valign="top">79.26</td>
<td align="center" valign="top">79.91</td>
<td align="center" valign="top">0.917</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>AUROC of the best performing model on 10-fold CV <bold>(A)</bold>. AUROC of the best performing model on independent data <bold>(B)</bold>. Comparison of different machine learning-based models on 10-fold CV <bold>(C)</bold>. Comparison of different machine learning-based models on independent data <bold>(D)</bold>.</p>
</caption>
<graphic xlink:href="fmed-10-1291352-g005.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="conclusions" id="sec13">
<label>4</label>
<title>Conclusion</title>
<p>Snake venom is a mixture of deadly proteins that can anesthetize and kill prey. Scientists have found a variety of proteins with potential pharmacological uses from snake venom. Further research on snake venom proteins will contribute to drug development. In this work, an innovative computational model was constructed to classify snake toxin proteins. NV, W2V, and <italic>g-gap</italic> were utilized to encode the protein sequences. Subsequently, the optimal feature subset was obtained by ANOVA and GBDT with IFS. By comparing different machine learning-based models, the best model was attained by the CNN-based classifier. Furthermore, the results showed that the proposed model could provide spectacular generalization ability. The dataset and codes are available at <ext-link xlink:href="https://github.com/linDing-groups/Deep-STP" ext-link-type="uri">https://github.com/linDing-groups/Deep-STP</ext-link>. Further studies will focus on constructing a web application for the proposed model. Moreover, other advanced feature selection techniques and algorithms will be employed to further increase the efficiency of classification.</p>
</sec>
<sec sec-type="data-availability" id="sec14">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="sec15">
<title>Author contributions</title>
<p>HZ: Conceptualization, Experimentation, Methodology, Visualization, Writing &#x2013; original draft preparation. ZG: Data curation, Methodology, Experimentation. RMA: Data curation, Experimentation, Methodology, Visualization. ZA: Data curation, Methodology, Visualization. PC: Data curation, Visualization. XC: Methodology. YZ: Methodology, Writing &#x2013; review &#x0026; editing. HL: Conceptualization, Supervision, Writing &#x2013; review &#x0026; editing. ZS: Conceptualization, Writing &#x2013; review &#x0026; editing. All authors have read and agreed to the published version of the manuscript.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec16">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work has been supported by the National Natural Science Foundation of China (62302079), the Natural Science Foundation of Sichuan Province (2022NSFSC1610) and funding of Cells and Regenerative Medicine Innovation Team (CDFYCX202208).</p>
</sec>
<sec sec-type="COI-statement" id="sec17">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec18">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmed.2023.1291352/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmed.2023.1291352/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.DOCX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Osipov</surname> <given-names>AV</given-names></name> <name><surname>Utkin</surname> <given-names>YN</given-names></name></person-group>. <article-title>Snake venom toxins targeted at the nervous system</article-title>. <source>Snake Venoms Toxinol</source>. (<year>2017</year>):<fpage>189</fpage>&#x2013;<lpage>214</lpage>. doi: <pub-id pub-id-type="doi">10.1007/978-94-007-6410-1_23</pub-id></citation></ref>
<ref id="ref2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yamazaki</surname> <given-names>Y</given-names></name> <name><surname>Morita</surname> <given-names>T</given-names></name></person-group>. <article-title>Structure and function of snake venom cysteine-rich secretory proteins</article-title>. <source>Toxicon</source>. (<year>2004</year>) <volume>44</volume>:<fpage>227</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.toxicon.2004.05.023</pub-id></citation></ref>
<ref id="ref3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nirthanan</surname> <given-names>S</given-names></name></person-group>. <article-title>Snake three-finger &#x03B1;-neurotoxins and nicotinic acetylcholine receptors: molecules, mechanisms and medicine</article-title>. <source>Biochem Pharmacol</source>. (<year>2020</year>) <volume>181</volume>:<fpage>114168</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bcp.2020.114168</pub-id>, PMID: <pub-id pub-id-type="pmid">32710970</pub-id></citation></ref>
<ref id="ref4"><label>4.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Okuda</surname> <given-names>J</given-names></name> <name><surname>Kiyokawa</surname> <given-names>R</given-names></name></person-group>. <article-title>Snake as a symbol in medicine and pharmacy-a historical study</article-title>. <source>Yakushigaku Zasshi</source>. (<year>2000</year>) <volume>35</volume>:<fpage>25</fpage>&#x2013;<lpage>40</lpage>. PMID: <pub-id pub-id-type="pmid">11640204</pub-id></citation></ref>
<ref id="ref5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bordon</surname> <given-names>KDCF</given-names></name> <name><surname>Cologna</surname> <given-names>CT</given-names></name> <name><surname>Fornari-Baldo</surname> <given-names>EC</given-names></name> <name><surname>Pinheiro-Junior</surname> <given-names>EL</given-names></name> <name><surname>Cerni</surname> <given-names>FA</given-names></name> <name><surname>Amorim</surname> <given-names>FG</given-names></name> <etal/></person-group>. <article-title>From animal poisons and venoms to medicines: achievements, challenges and perspectives in drug discovery</article-title>. <source>Front Pharmacol</source>. (<year>2020</year>) <volume>11</volume>:<fpage>1132</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphar.2020.01132</pub-id>, PMID: <pub-id pub-id-type="pmid">32848750</pub-id></citation></ref>
<ref id="ref6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pearson</surname> <given-names>WR</given-names></name></person-group>. <article-title>Finding protein and nucleotide similarities with FASTA</article-title>. <source>Curr Protoc Bioinformatics</source>. (<year>2016</year>) <volume>53</volume>:<fpage>3.9.1</fpage>&#x2013;<lpage>3.9.25</lpage>. doi: <pub-id pub-id-type="doi">10.1002/0471250953.bi0309s53</pub-id></citation></ref>
<ref id="ref7"><label>7.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>Q</given-names></name> <name><surname>Hu</surname> <given-names>Q</given-names></name> <name><surname>Guo</surname> <given-names>M</given-names></name> <name><surname>Wang</surname> <given-names>G</given-names></name></person-group>. <article-title>HAlign: fast multiple similar DNA/RNA sequence alignment based on the Centre star strategy</article-title>. <source>Bioinformatics</source>. (<year>2015</year>) <volume>31</volume>:<fpage>2475</fpage>&#x2013;<lpage>81</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btv177</pub-id>, PMID: <pub-id pub-id-type="pmid">25812743</pub-id></citation></ref>
<ref id="ref8"><label>8.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wan</surname> <given-names>S</given-names></name> <name><surname>Zou</surname> <given-names>Q</given-names></name></person-group>. <article-title>HAlign-II: efficient ultra-large multiple sequence alignment and phylogenetic tree reconstruction with distributed and parallel computing</article-title>. <source>Algorithms Mol Biol</source>. (<year>2017</year>) <volume>12</volume>:<fpage>25</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13015-017-0116-x</pub-id>, PMID: <pub-id pub-id-type="pmid">29026435</pub-id></citation></ref>
<ref id="ref9"><label>9.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Madden</surname> <given-names>T</given-names></name></person-group>. <source>The BLAST sequence analysis tool, the NCBI handbook</source>. <edition>2nd</edition> ed. <publisher-loc>Bethesda, MD</publisher-loc>: <publisher-name>National Center for Biotechnology Information (US)</publisher-name> (<year>2013</year>).</citation></ref>
<ref id="ref10"><label>10.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Sun</surname> <given-names>Z-J</given-names></name> <name><surname>Huang</surname> <given-names>Q-L</given-names></name> <name><surname>Yuan</surname> <given-names>S-S</given-names></name> <name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Dao</surname> <given-names>F-Y</given-names></name> <etal/></person-group>. <article-title>Deep-4mCW2V: a sequence-based predictor to identify N4-methylcytosine sites in <italic>Escherichia coli</italic></article-title>. <source>Methods</source>. (<year>2021</year>) <volume>203</volume>:<fpage>558</fpage>&#x2013;<lpage>63</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ymeth.2021.07.011</pub-id></citation></ref>
<ref id="ref11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>H</given-names></name> <name><surname>Zhao</surname> <given-names>YW</given-names></name> <name><surname>Zou</surname> <given-names>P</given-names></name> <name><surname>Zhang</surname> <given-names>CM</given-names></name> <name><surname>Chen</surname> <given-names>R</given-names></name> <name><surname>Huang</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>HBPred: a tool to identify growth hormone-binding proteins</article-title>. <source>Int J Biol Sci</source>. (<year>2018</year>) <volume>14</volume>:<fpage>957</fpage>&#x2013;<lpage>64</lpage>. doi: <pub-id pub-id-type="doi">10.7150/ijbs.24174</pub-id>, PMID: <pub-id pub-id-type="pmid">29989085</pub-id></citation></ref>
<ref id="ref12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Yuan</surname> <given-names>S-S</given-names></name> <name><surname>Huang</surname> <given-names>Q-L</given-names></name> <name><surname>Sun</surname> <given-names>Z-J</given-names></name> <name><surname>Dao</surname> <given-names>F-Y</given-names></name> <name><surname>Yu</surname> <given-names>X-L</given-names></name> <etal/></person-group>. <article-title>Identification of cyclin protein using gradient boost decision tree algorithm</article-title>. <source>Comput Struct Biotechnol J</source>. (<year>2021</year>) <volume>19</volume>:<fpage>4123</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.csbj.2021.07.013</pub-id>, PMID: <pub-id pub-id-type="pmid">34527186</pub-id></citation></ref>
<ref id="ref13"><label>13.</label><citation citation-type="journal"><person-group person-group-type="author"><collab id="coll1">UniProt Consortium</collab></person-group>. <article-title>UniProt: a worldwide hub of protein knowledge</article-title>. <source>Nucleic Acids Res</source>. (<year>2019</year>) <volume>47</volume>:<fpage>D506</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gky1049</pub-id>, PMID: <pub-id pub-id-type="pmid">30395287</pub-id></citation></ref>
<ref id="ref14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>O'Leary</surname> <given-names>NA</given-names></name> <name><surname>Wright</surname> <given-names>MW</given-names></name> <name><surname>Brister</surname> <given-names>JR</given-names></name> <name><surname>Ciufo</surname> <given-names>S</given-names></name> <name><surname>Haddad</surname> <given-names>D</given-names></name> <name><surname>McVeigh</surname> <given-names>R</given-names></name> <etal/></person-group>. <article-title>Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation</article-title>. <source>Nucleic Acids Res</source>. (<year>2016</year>) <volume>44</volume>:<fpage>D733</fpage>&#x2013;<lpage>45</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkv1189</pub-id>, PMID: <pub-id pub-id-type="pmid">26553804</pub-id></citation></ref>
<ref id="ref15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fu</surname> <given-names>L</given-names></name> <name><surname>Niu</surname> <given-names>B</given-names></name> <name><surname>Zhu</surname> <given-names>Z</given-names></name> <name><surname>Wu</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name></person-group>. <article-title>CD-HIT: accelerated for clustering the next-generation sequencing data</article-title>. <source>Bioinformatics</source>. (<year>2012</year>) <volume>28</volume>:<fpage>3150</fpage>&#x2013;<lpage>2</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/bts565</pub-id>, PMID: <pub-id pub-id-type="pmid">23060610</pub-id></citation></ref>
<ref id="ref16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Dao</surname> <given-names>F-Y</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Lin</surname> <given-names>H</given-names></name></person-group>. <article-title>DeepIPs: comprehensive assessment and computational identification of phosphorylation sites of SARS-CoV-2 infection using a deep learning-based approach</article-title>. <source>Brief Bioinform</source>. (<year>2021</year>) <volume>22</volume>:<fpage>244</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbab244</pub-id>, PMID: <pub-id pub-id-type="pmid">34184738</pub-id></citation></ref>
<ref id="ref17"><label>17.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>M</given-names></name> <name><surname>Yu</surname> <given-names>C</given-names></name> <name><surname>Liang</surname> <given-names>Q</given-names></name> <name><surname>He</surname> <given-names>RL</given-names></name> <name><surname>Yau</surname> <given-names>SS-T</given-names></name></person-group>. <article-title>A novel method of characterizing genetic sequences: genome space with biological distance and applications</article-title>. <source>PLoS One</source>. (<year>2011</year>) <volume>6</volume>:<fpage>e17293</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0017293</pub-id>, PMID: <pub-id pub-id-type="pmid">21399690</pub-id></citation></ref>
<ref id="ref18"><label>18.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>D</given-names></name> <name><surname>Chen</surname> <given-names>H-D</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Yuan</surname> <given-names>S-S</given-names></name> <name><surname>Huang</surname> <given-names>Q-L</given-names></name> <name><surname>Zhang</surname> <given-names>Z-Y</given-names></name> <etal/></person-group>. <article-title>iBLP: an XGBoost-based predictor for identifying bioluminescent proteins</article-title>. <source>Comput Math Methods Med</source>. (<year>2021</year>) <volume>2021</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>. doi: <pub-id pub-id-type="doi">10.1155/2021/6664362</pub-id></citation></ref>
<ref id="ref19"><label>19.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>Q</given-names></name> <name><surname>Xing</surname> <given-names>P</given-names></name> <name><surname>Wei</surname> <given-names>L</given-names></name> <name><surname>Liu</surname> <given-names>B</given-names></name></person-group>. <article-title>Gene2vec: gene subsequence embedding for prediction of mammalian N6-Methyladenosine sites from mRNA</article-title>. <source>RNA</source>. (<year>2019</year>) <volume>25</volume>:<fpage>205</fpage>&#x2013;<lpage>18</lpage>. doi: <pub-id pub-id-type="doi">10.1261/rna.069112.118</pub-id>, PMID: <pub-id pub-id-type="pmid">30425123</pub-id></citation></ref>
<ref id="ref20"><label>20.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Charoenkwan</surname> <given-names>P</given-names></name> <name><surname>Nantasenamat</surname> <given-names>C</given-names></name> <name><surname>Hasan</surname> <given-names>MM</given-names></name> <name><surname>Manavalan</surname> <given-names>B</given-names></name> <name><surname>Shoombuatong</surname> <given-names>W</given-names></name></person-group>. <article-title>BERT4Bitter: a bidirectional encoder representations from transformers (BERT)-based model for improving the prediction of bitter peptides</article-title>. <source>Bioinformatics</source>. (<year>2021</year>) <volume>37</volume>:<fpage>2556</fpage>&#x2013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btab133</pub-id>, PMID: <pub-id pub-id-type="pmid">33638635</pub-id></citation></ref>
<ref id="ref21"><label>21.</label><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Deho</surname> <given-names>B.O.</given-names></name> <name><surname>Agangiba</surname> <given-names>A.W.</given-names></name> <name><surname>Aryeh</surname> <given-names>L.F.</given-names></name> <name><surname>Ansah</surname> <given-names>A.J.</given-names></name></person-group>, <article-title>Sentiment analysis with word embedding</article-title>, <conf-name>2018 IEEE 7th international conference on Adaptive Science &#x0026; Technology (ICAST)</conf-name>, <conf-loc>Accra, Ghana</conf-loc>. (<year>2018</year>), pp. <fpage>1</fpage>&#x2013;<lpage>4</lpage>.</citation></ref>
<ref id="ref22"><label>22.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>McCormick</surname> <given-names>C.</given-names></name></person-group> (<year>2016</year>). Word2Vec tutorial-the skip-gram model. Available at: <ext-link xlink:href="http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model" ext-link-type="uri">http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model</ext-link></citation></ref>
<ref id="ref23"><label>23.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Church</surname> <given-names>KW</given-names></name></person-group>. <article-title>Word2Vec</article-title>. <source>Nat Lang Eng</source>. (<year>2017</year>) <volume>23</volume>:<fpage>155</fpage>&#x2013;<lpage>62</lpage>. doi: <pub-id pub-id-type="doi">10.1017/S1351324916000334</pub-id></citation></ref>
<ref id="ref24"><label>24.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Dao</surname> <given-names>F-Y</given-names></name> <name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Yang</surname> <given-names>H</given-names></name> <name><surname>Zhou</surname> <given-names>P</given-names></name> <name><surname>Chen</surname> <given-names>W</given-names></name> <etal/></person-group>. <article-title>Identification of potential inhibitors against SARS-CoV-2 using computational drug repurposing study</article-title>. <source>Curr Bioinforma</source>. (<year>2021</year>) <volume>16</volume>:<fpage>1320</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.2174/1574893616666210726155903</pub-id></citation></ref>
<ref id="ref25"><label>25.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>F-Y</given-names></name> <name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Yang</surname> <given-names>H</given-names></name> <name><surname>Su</surname> <given-names>W</given-names></name> <name><surname>Gao</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>A computational platform to identify origins of replication sites in eukaryotes</article-title>. <source>Brief Bioinform</source>. (<year>2021</year>) <volume>22</volume>:<fpage>1940</fpage>&#x2013;<lpage>50</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbaa017</pub-id>, PMID: <pub-id pub-id-type="pmid">32065211</pub-id></citation></ref>
<ref id="ref26"><label>26.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zou</surname> <given-names>X</given-names></name> <name><surname>Ren</surname> <given-names>L</given-names></name> <name><surname>Cai</surname> <given-names>P</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Ding</surname> <given-names>H</given-names></name> <name><surname>Deng</surname> <given-names>K</given-names></name> <etal/></person-group>. <article-title>Accurately identifying hemagglutinin using sequence information and machine learning methods</article-title>. <source>Front Med</source>. (<year>2023</year>) <volume>10</volume>:<fpage>1281880</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fmed.2023.1281880</pub-id>, PMID: <pub-id pub-id-type="pmid">38020152</pub-id></citation></ref>
<ref id="ref27"><label>27.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>W</given-names></name> <name><surname>Yuan</surname> <given-names>SS</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Huang</surname> <given-names>CB</given-names></name> <name><surname>Lin</surname> <given-names>H</given-names></name> <name><surname>Liao</surname> <given-names>B</given-names></name></person-group>. <article-title>A first computational frame for recognizing heparin-binding protein</article-title>. <source>Diagnostics</source>. (<year>2023</year>) <volume>13</volume>:<fpage>2465</fpage>. doi: <pub-id pub-id-type="doi">10.3390/diagnostics13142465</pub-id></citation></ref>
<ref id="ref28"><label>28.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dao</surname> <given-names>FY</given-names></name> <name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Wang</surname> <given-names>F</given-names></name> <name><surname>Feng</surname> <given-names>CQ</given-names></name> <name><surname>Ding</surname> <given-names>H</given-names></name> <name><surname>Chen</surname> <given-names>W</given-names></name> <etal/></person-group>. <article-title>Identify origin of replication in <italic>Saccharomyces cerevisiae</italic> using two-step feature selection technique</article-title>. <source>Bioinformatics</source>. (<year>2019</year>) <volume>35</volume>:<fpage>2075</fpage>&#x2013;<lpage>83</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/bty943</pub-id>, PMID: <pub-id pub-id-type="pmid">30428009</pub-id></citation></ref>
<ref id="ref29"><label>29.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>De Jay</surname> <given-names>N</given-names></name> <name><surname>Papillon-Cavanagh</surname> <given-names>S</given-names></name> <name><surname>Olsen</surname> <given-names>C</given-names></name> <name><surname>El-Hachem</surname> <given-names>N</given-names></name> <name><surname>Bontempi</surname> <given-names>G</given-names></name> <name><surname>Haibe-Kains</surname> <given-names>B</given-names></name></person-group>. <article-title>mRMRe: an R package for parallelized mRMR ensemble feature selection</article-title>. <source>Bioinformatics</source>. (<year>2013</year>) <volume>29</volume>:<fpage>2365</fpage>&#x2013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btt383</pub-id>, PMID: <pub-id pub-id-type="pmid">23825369</pub-id></citation></ref>
<ref id="ref30"><label>30.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y</given-names></name> <name><surname>Bottou</surname> <given-names>L</given-names></name> <name><surname>Bengio</surname> <given-names>Y</given-names></name> <name><surname>Haffner</surname> <given-names>P</given-names></name></person-group>. <article-title>Gradient-based learning applied to document recognition</article-title>. <source>Proc IEEE</source>. (<year>1998</year>) <volume>86</volume>:<fpage>2278</fpage>&#x2013;<lpage>324</lpage>. doi: <pub-id pub-id-type="doi">10.1109/5.726791</pub-id></citation></ref>
<ref id="ref31"><label>31.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Niu</surname> <given-names>M</given-names></name> <name><surname>Lin</surname> <given-names>Y</given-names></name> <name><surname>Zou</surname> <given-names>Q</given-names></name></person-group>. <article-title>sgRNACNN: identifying sgRNA on-target activity in four crops using ensembles of convolutional neural networks</article-title>. <source>Plant Mol Biol</source>. (<year>2021</year>) <volume>105</volume>:<fpage>483</fpage>&#x2013;<lpage>95</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11103-020-01102-y</pub-id>, PMID: <pub-id pub-id-type="pmid">33385273</pub-id></citation></ref>
<ref id="ref32"><label>32.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kwon</surname> <given-names>Y-H</given-names></name> <name><surname>Shin</surname> <given-names>S-B</given-names></name> <name><surname>Kim</surname> <given-names>S-D</given-names></name></person-group>. <article-title>Electroencephalography based fusion two-dimensional (2D)-convolution neural networks (CNN) model for emotion recognition system</article-title>. <source>Sensors</source>. (<year>2018</year>) <volume>18</volume>:<fpage>1383</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s18051383</pub-id>, PMID: <pub-id pub-id-type="pmid">29710869</pub-id></citation></ref>
<ref id="ref33"><label>33.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>H</given-names></name> <name><surname>Dao</surname> <given-names>F-Y</given-names></name> <name><surname>Zulfiqar</surname> <given-names>H</given-names></name> <name><surname>Su</surname> <given-names>W</given-names></name> <name><surname>Ding</surname> <given-names>H</given-names></name> <name><surname>Liu</surname> <given-names>L</given-names></name> <etal/></person-group>. <article-title>A sequence-based deep learning approach to predict CTCF-mediated chromatin loop</article-title>. <source>Brief Bioinform</source>. (<year>2021</year>) <volume>22</volume>:<fpage>bbab031</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbab031</pub-id>, PMID: <pub-id pub-id-type="pmid">33634313</pub-id></citation></ref>
<ref id="ref34"><label>34.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Chollet</surname> <given-names>F.</given-names></name></person-group>. (<year>2015</year>). Keras: Deep learning library for Theano and TensorFlow. Available at: <ext-link xlink:href="https://keras.io" ext-link-type="uri">https://keras.io</ext-link></citation></ref>
<ref id="ref35"><label>35.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>R</given-names></name> <name><surname>Freitas</surname> <given-names>C</given-names></name> <name><surname>Chan</surname> <given-names>L</given-names></name> <name><surname>Sun</surname> <given-names>M</given-names></name> <name><surname>Jiang</surname> <given-names>H</given-names></name> <name><surname>Chen</surname> <given-names>Z</given-names></name></person-group>. <article-title>ProLanGO: protein function prediction using neural machine translation based on a recurrent neural network</article-title>. <source>Molecules</source>. (<year>2017</year>) <volume>22</volume>:<fpage>1732</fpage>. doi: <pub-id pub-id-type="doi">10.3390/molecules22101732</pub-id>, PMID: <pub-id pub-id-type="pmid">29039790</pub-id></citation></ref>
<ref id="ref36"><label>36.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abraham</surname> <given-names>A</given-names></name> <name><surname>Pedregosa</surname> <given-names>F</given-names></name> <name><surname>Eickenberg</surname> <given-names>M</given-names></name> <name><surname>Gervais</surname> <given-names>P</given-names></name> <name><surname>Mueller</surname> <given-names>A</given-names></name> <name><surname>Kossaifi</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Machine learning for neuroimaging with scikit-learn</article-title>. <source>Front Neuroinform</source>. (<year>2014</year>) <volume>8</volume>:<fpage>14</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fninf.2014.00014</pub-id></citation></ref>
</ref-list>
</back>
</article>