<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Cell Dev. Biol.</journal-id>
<journal-title>Frontiers in Cell and Developmental Biology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Cell Dev. Biol.</abbrev-journal-title>
<issn pub-type="epub">2296-634X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcell.2020.594587</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Cell and Developmental Biology</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>DeepCSO: A Deep-Learning Network Approach to Predicting Cysteine S-Sulphenylation Sites</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Lyu</surname> <given-names>Xiaru</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1089909/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Shuhao</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x2020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1147628/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jiang</surname> <given-names>Chunyang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>He</surname> <given-names>Ningning</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/826502/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname> <given-names>Zhen</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/987016/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zou</surname> <given-names>Yang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1023176/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname> <given-names>Lei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1021282/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Basic Medicine, Qingdao University</institution>, <addr-line>Qingdao</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>College of Life Sciences, Qingdao University</institution>, <addr-line>Qingdao</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Collaborative Innovation Center of Henan Grain Crops, Henan Agricultural University</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Key Laboratory of Rice Biology in Henan Province, Henan Agricultural University</institution>, <addr-line>Zhengzhou</addr-line>, <country>China</country></aff>
<aff id="aff5"><sup>5</sup><institution>School of Data Science and Software Engineering, Qingdao University</institution>, <addr-line>Qingdao</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Jiangning Song, Monash University, Australia</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Shaoping Shi, Nanchang University, China; Zexian Liu, Sun Yat-sen University Cancer Center (SYSUCC), China</p></fn>
<corresp id="c001">&#x002A;Correspondence: Yang Zou, <email>yangzou306@gmail.com</email></corresp>
<corresp id="c002">Lei Li, <email>leili@qdu.edu.cn</email></corresp>
<fn fn-type="other" id="fn002"><p><sup>&#x2020;</sup>These authors share first authorship</p></fn>
<fn fn-type="other" id="fn004"><p>This article was submitted to Cellular Biochemistry, a section of the journal Frontiers in Cell and Developmental Biology</p></fn>
</author-notes>
<pub-date pub-type="epub">
<day>01</day>
<month>12</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>8</volume>
<elocation-id>594587</elocation-id>
<history>
<date date-type="received">
<day>13</day>
<month>08</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>11</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2020 Lyu, Li, Jiang, He, Chen, Zou and Li.</copyright-statement>
<copyright-year>2020</copyright-year>
<copyright-holder>Lyu, Li, Jiang, He, Chen, Zou and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Cysteine S-sulphenylation (CSO), as a novel post-translational modification (PTM), has emerged as a potential mechanism to regulate protein functions and affect signal networks. Because of its functional significance, several prediction approaches have been developed. Nevertheless, they are based on a limited dataset from <italic>Homo sapiens</italic> and there is a lack of prediction tools for the CSO sites of other species. Recently, this modification has been investigated at the proteomics scale for a few species and the number of identified CSO sites has significantly increased. Thus, it is essential to explore the characteristics of this modification across different species and construct prediction models with better performances based on the enlarged dataset. In this study, we constructed several classifiers and found that the long short-term memory model with the word-embedding encoding approach, dubbed LSTM<sub><italic>WE</italic></sub>, performs favorably to the traditional machine-learning models and other deep-learning models across different species, in terms of cross-validation and independent test. The area under the receiver operating characteristic (ROC) curve for LSTM<sub><italic>WE</italic></sub> ranged from 0.82 to 0.85 for different organisms, which was superior to the reported CSO predictors. Moreover, we developed the general model based on the integrated data from different species and it showed great universality and effectiveness. We provided the on-line prediction service called DeepCSO that included both species-specific and general models, which is accessible through <ext-link ext-link-type="uri" xlink:href="http://www.bioinfogo.org/DeepCSO">http://www.bioinfogo.org/DeepCSO</ext-link>.</p>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>modification site prediction</kwd>
<kwd>deep learning</kwd>
<kwd>Cysteine S-sulphenylation</kwd>
<kwd>post-translational modification</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Natural Science Foundation of China<named-content content-type="fundref-id">10.13039/501100001809</named-content></contract-sponsor>
<counts>
<fig-count count="7"/>
<table-count count="3"/>
<equation-count count="6"/>
<ref-count count="39"/>
<page-count count="12"/>
<word-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec id="S1">
<title>Introduction</title>
<p>Protein Cysteine S-sulphenylation (CSO) is the reversible oxidation of protein cysteinyl thiols to sulphenic acids. S-sulphenylation functions as an intermediate on the path toward other redox modifications, such as disulfide formation, S-glutathionylation, and overoxidation to sulfinic and sulfonic acids (<xref ref-type="bibr" rid="B24">Paulsen and Carroll, 2013</xref>; <xref ref-type="bibr" rid="B17">Huang J.J et al., 2018</xref>). This modification has been reported to influence protein functions, regulate signal transduction and affect cell cycle (<xref ref-type="bibr" rid="B28">Van Breusegem and Dat, 2006</xref>; <xref ref-type="bibr" rid="B22">Men and Wang, 2007</xref>; <xref ref-type="bibr" rid="B24">Paulsen and Carroll, 2013</xref>; <xref ref-type="bibr" rid="B15">Hourihan et al., 2016</xref>; <xref ref-type="bibr" rid="B11">Choudhury et al., 2017</xref>; <xref ref-type="bibr" rid="B23">Mhamdi and Van Breusegem, 2018</xref>). So far, thousands of CSO sites have been identified from different species including the mammal <italic>Homo sapiens</italic> and the plant organism <italic>Arabidopsis thaliana</italic> using the chemoproteomics approach (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>; <xref ref-type="bibr" rid="B21">Li et al., 2016</xref>; <xref ref-type="bibr" rid="B13">Gupta et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Akter et al., 2018</xref>; <xref ref-type="bibr" rid="B16">Huang et al., 2019</xref>; summarized in <xref ref-type="supplementary-material" rid="TS1">Supplementary Table 1</xref>). Nevertheless, the CSO site detection remains a major methodological issue due to low abundance and dynamic level of CSO-containing proteins <italic>in vivo</italic>. In contrast to the time-consuming and expensive experimental approaches, computational methods for predicting CSO sites have attracted considerable attention because of their convenience and efficiency.</p>
<p>Several computational methods have been developed for the prediction of CSO sites, mainly based on a single human dataset containing 1105 identified CSO sites (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>). They include MDD-SOH (<xref ref-type="bibr" rid="B4">Bui et al., 2016a</xref>), iSulf-Cys (<xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>), SOHSite (<xref ref-type="bibr" rid="B5">Bui et al., 2016b</xref>), PRESS (<xref ref-type="bibr" rid="B25">Sakka et al., 2016</xref>), Sulf_FSVM (<xref ref-type="bibr" rid="B20">Ju and Wang, 2018</xref>), S-SulfPred (<xref ref-type="bibr" rid="B19">Jia and Zuo, 2017</xref>), Fu-SulfPred (<xref ref-type="bibr" rid="B32">Wang et al., 2019</xref>), SulCysSite (<xref ref-type="bibr" rid="B14">Hasan et al., 2017</xref>), SOHPRED (<xref ref-type="bibr" rid="B33">Wang et al., 2016</xref>), and PredCSO (<xref ref-type="bibr" rid="B12">Deng et al., 2018</xref>). Out of them, two are based on protein three-dimensional structures, in which PRESS relies on four different protein structural properties (<xref ref-type="bibr" rid="B25">Sakka et al., 2016</xref>) whereas PredCSO is an ensemble model that combines bootstrap resampling, gradient tree boosting and majority voting with the 21 features refined out using a two-step feature selection procedure (<xref ref-type="bibr" rid="B12">Deng et al., 2018</xref>). The advantage of both classifiers is the inclusion of accurate structural features but their drawback is the limitation of the available structures. The remaining classifiers are based on protein sequences. They can be classified into two clusters in terms of model complexity. The first cluster contains four relatively simple models. iSulf-Cys is an SVM (Support Vector Machine)-based classifier with the integration of three features including binary, PSAAP, and AAindex (<xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>). 
SOHSite is an SVM-based classifier with the combined features of position-specific scoring matrix (PSSM) and AAindex (<xref ref-type="bibr" rid="B5">Bui et al., 2016b</xref>). SulCysSite is an RF (Random Forest)-based classifier with the integration of multiple features (<xref ref-type="bibr" rid="B14">Hasan et al., 2017</xref>) and Sulf_FSVM is a fuzzy SVM classifier using mRMR feature selection from three kinds of features (<xref ref-type="bibr" rid="B20">Ju and Wang, 2018</xref>). The second cluster includes four relatively complex models. MDD-SOH contains two-layered SVMs trained with MDDLogo-identified substrate motifs (<xref ref-type="bibr" rid="B4">Bui et al., 2016a</xref>). S-SulfPred is an SVM-based classifier with the balanced training dataset established using one-sided selection undersampling for negative samples and synthetic minority oversampling for positive samples (<xref ref-type="bibr" rid="B19">Jia and Zuo, 2017</xref>). Fu-SulfPred contains two layers of forest-based structure with the reconstruction of training datasets for data balance (<xref ref-type="bibr" rid="B32">Wang et al., 2019</xref>). SOHPRED was built by integrating four complementary predictors (i.e., a naive Bayesian predictor, an RF predictor, and two SVM predictors), each of which was associated with different training features (<xref ref-type="bibr" rid="B33">Wang et al., 2016</xref>). In summary, the characteristics of these sequence-based models are the combination of distinct types of features, or/and the balancing of training data, or/and the integration of different classifiers. Although the developed classifiers have made contributions to the prediction of CSO sites, most of them are currently inaccessible. Moreover, there is a lack of prediction tools for the CSO sites of multiple species. With the growing number of CSO sites verified, it is essential to develop species-specific prediction models with high accuracy or even a general model.</p>
<p>Compared to traditional machine-learning (ML) algorithms (e.g., SVM and RF) used in the prediction approaches described above, the deep-learning (DL) architecture is a promising ML algorithm. In the DL algorithm, a suitable representation of the input data can be transformed into highly abstract features through propagating the whole model. Superposition of hidden layers in neural networks can increase the ability of feature extraction, resulting in a more accurate interpretation of latent data patterns. Indeed, several frequently utilized DL models have been recently applied in the field of bioinformatics, especially the prediction of post-translational modification (PTM) sites. For instance, deep neural networks were utilized for the prediction of protein nitration and nitrosylation sites (<xref ref-type="bibr" rid="B34">Xie et al., 2018</xref>), recurrent neural networks (RNNs) were employed for the prediction of lysine malonylation sites (<xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>) and convolutional neural networks (CNNs) were used for the prediction of phosphorylation sites and crotonylation sites (<xref ref-type="bibr" rid="B31">Wang et al., 2017</xref>; <xref ref-type="bibr" rid="B39">Zhao et al., 2020</xref>). Deep learning algorithms have demonstrated their advantages in the application of large data sets, compared to the traditional ML methodology (<xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>). Because of this, the introduction of DL algorithms into the prediction of CSO sites would be a promising move to provide reliable candidates for further experimental consideration.</p>
<p>In this study, we constructed a number of <italic>in silico</italic> approaches for the prediction of the CSO sites for <italic>H. sapiens</italic> and <italic>A. thaliana</italic>. These approaches included the RF and SVM algorithms, one-dimensional CNN (1D-CNN), two-dimensional CNN (2D-CNN) and long short-term memory (LSTM) that is an RNN type. The LSTM model with the word-embedding encoding approach, called LSTM<sub><italic>WE</italic></sub>, compared favorably to the remaining approaches with AUC values of 0.82 and 0.85 in human and Arabidopsis in terms of cross-validation. Moreover, LSTM<sub><italic>WE</italic></sub> trained using the data from one organism achieved outstanding performance in predicting CSO sites of other organisms (e.g., AUC = 0.80 for the prediction of Arabidopsis CSO sites using the human model), suggesting that CSO is highly conserved. Therefore, we constructed a general CSO prediction model. These models will facilitate the discovery of new CSO sites and thus will contribute to the understanding of roles and functions of CSO in diverse cellular processes.</p>
</sec>
<sec id="S2" sec-type="materials|methods">
<title>Materials and Methods</title>
<sec id="S2.SS1">
<title>Data Collection and Preprocessing</title>
<p>The experimentally identified CSO sites were derived from two different organisms including <italic>H. sapiens</italic> and <italic>A. thaliana</italic> (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>; <xref ref-type="bibr" rid="B21">Li et al., 2016</xref>; <xref ref-type="bibr" rid="B13">Gupta et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Akter et al., 2018</xref>; <xref ref-type="bibr" rid="B16">Huang et al., 2019</xref>). The data of the species were pre-processed and the related procedure was exemplified using the <italic>A. thaliana</italic> data, as listed below (<xref ref-type="fig" rid="F1">Figure 1A</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption><p>The flowchart of the dataset process for <italic>A. thaliana</italic> <bold>(A)</bold> and <italic>H. sapiens</italic> <bold>(B)</bold>.</p></caption>
<graphic xlink:href="fcell-08-594587-g001.tif"/>
</fig>
<p>We mapped 1537 Arabidopsis CSO sites (<xref ref-type="bibr" rid="B16">Huang et al., 2019</xref>) to the UniprotKB database (<xref ref-type="bibr" rid="B26">UniProt Consortium, 2011</xref>) and 1535 sites from 1130 proteins were retained as positive sites. The remaining 8819 Cysteine residues in the same proteins were defined as negative sites. Moreover, we truncated these protein sequences into 35-residue segments with the Cysteine located at the center and the positive/negative sites correspond to positive/negative segments, respectively. It should be noted that if the central Cysteine was located around the N-terminus or C-terminus of a protein sequence, the gap symbol &#x201C;-&#x201D; was added to the corresponding positions to ensure that the segment had the same length. The segment length was optimized as a hyper-parameter in the Bayesian optimization method (see details in Section of &#x201C;Optimization Methods for Hyper-Parameters&#x201D;) and finally determined as 35. Furthermore, to reduce the potential influence of the segments with high similarity on the performance of the models to be constructed, we required the identity between any two sequences to be less than 40%, referring to previous studies (<xref ref-type="bibr" rid="B4">Bui et al., 2016a</xref>; <xref ref-type="bibr" rid="B33">Wang et al., 2016</xref>; <xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>). When the identity was &#x003E;40% between two positive segments or two negative segments, one was randomly removed. When the identity was &#x003E;40% between a positive segment and a negative segment, the positive was retained and the negative was discarded. As a result, 1380 positives and 7421 negatives were retained. 
Finally, we randomly separated the positive and negative segments into 11 groups of which 10 were used for 10-fold cross-validation (1254 positives and 6746 negatives) and the rest for an independent test (126 positives and 675 negatives) (<xref ref-type="fig" rid="F1">Figure 1A</xref>). Similarly, the cross-validation dataset for <italic>H. sapiens</italic> contained 16,249 samples (2507 positives and 13,742 negatives) and the independent test set comprised 1625 samples (251 positives and 1374 negatives) (<xref ref-type="fig" rid="F1">Figure 1B</xref>). These datasets are available at <ext-link ext-link-type="uri" xlink:href="http://www.bioinfogo.org/DeepCSO/download.php">http://www.bioinfogo.org/DeepCSO/download.php</ext-link>.</p>
</sec>
<sec id="S2.SS2">
<title>Feature Encoding Schemes</title>
<sec id="S2.SS2.SSS1">
<title>Numerical Representation for Amino Acids (NUM)</title>
<p>The NUM encoding approach maps each type of amino acid residue to an integer (<xref ref-type="bibr" rid="B38">Zhang Y. et al., 2019</xref>). Specifically, in the alphabet &#x201C;AVLIFWMPGSTCYNQHKRDE-&#x201D;, each letter from &#x201C;A&#x201D; to &#x201C;-&#x201D; is converted to the integers from 0 to 20 in turn. For example, the sequence &#x201C;VAMR&#x201D; is encoded as &#x201C;1,0,6,17.&#x201D; This encoding was used as the input of the first layer for both LSTM and 1D-CNN.</p>
</sec>
<sec id="S2.SS2.SSS2">
<title>Enhanced Amino Acid Composition</title>
<p>The enhanced amino acid composition (EAAC) encoding (<xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>,<xref ref-type="bibr" rid="B9">c</xref>, <xref ref-type="bibr" rid="B10">2020</xref>; <xref ref-type="bibr" rid="B18">Huang Y. et al., 2018</xref>) introduces a fixed-length sliding window based on the encoding of amino acid composition (AAC), which calculates the frequency of each type of amino acid in a protein or peptide sequence (<xref ref-type="bibr" rid="B3">Bhasin and Raghava, 2004</xref>). EAAC is calculated by continuously sliding a fixed-length sequence window (using the default value 5) from the N-terminus to the C-terminus of each peptide. The related formula is listed below:</p>
<disp-formula id="S2.Ex1">
<mml:math id="M1">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>C</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.E1">
<label>(1)</label>
<mml:math id="M2">
<mml:mrow>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2026;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>d</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>o</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>&#x2062;</mml:mo>
<mml:mn>35</mml:mn>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where <italic>N(t, win)</italic> is the number of amino acid <italic>t</italic> in the sliding window <italic>win</italic>, and <italic>N(win)</italic> is the size of the sliding window <italic>win</italic>.</p>
</sec>
<sec id="S2.SS2.SSS3">
<title>Binary Encoding</title>
<p>In the binary encoding (<xref ref-type="bibr" rid="B9">Chen et al., 2018c</xref>), each amino acid is represented by a 21-dimensional binary vector that represents 20 amino acids and a complement &#x201C;-.&#x201D; The corresponding position is set to 1 and the remaining positions are set to 0. For example, the amino acid &#x201C;A&#x201D; is represented by &#x201C;100000000000000000000,&#x201D; &#x201C;V&#x201D; is represented by &#x201C;010000000000000000000,&#x201D; and the symbol &#x201C;-&#x201D; is represented by &#x201C;000000000000000000001,&#x201D; according to the alphabet &#x201C;AVLIFWMPGSTCYNQHKRDE-.&#x201D;</p>
</sec>
<sec id="S2.SS2.SSS4">
<title>AAindex Encoding</title>
<p>AAindex is a database of various indices representing distinct physicochemical and biochemical properties of amino acids and pairs of amino acids.<sup><xref ref-type="fn" rid="footnote1">1</xref></sup> Of the 544 physicochemical properties, we retained 531 properties after the removal of properties with &#x201C;NA.&#x201D; We calculated the performance for each property using the RF classifier based on the 10-fold cross-validation dataset of Arabidopsis. We selected the top 36 properties with AUC &#x003E; 0.7 (<xref ref-type="supplementary-material" rid="TS2">Supplementary Table 3</xref>).</p>
</sec>
<sec id="S2.SS2.SSS5">
<title>The Composition of k-Spaced Amino Acid Pairs</title>
<p>The composition of k-spaced amino acid pairs (CKSAAP) encoding contains the frequencies of amino acid pairs whose two residues are separated by k residues (k = 0, 1, 2, 3, 4, 5; we used the default value of 5) (<xref ref-type="bibr" rid="B9">Chen et al., 2018c</xref>). This scheme represents the short- or long-range interactions amongst the residues along the sequence. The CKSAAP encoding with k = 0 is identical to the di-peptide composition.</p>
</sec>
<sec id="S2.SS2.SSS6">
<title>The Position-Specific Scoring Matrix</title>
<p>The PSSM encoding was derived from the previous publication (<xref ref-type="bibr" rid="B34">Xie et al., 2018</xref>). In brief, we calculated the statistical significance of the differences in the frequencies of symbol occurrence between the positive and negative samples using a two-sample <italic>t</italic>-test (<xref ref-type="bibr" rid="B27">Vacic et al., 2006</xref>). Accordingly, the PSSM of significant <italic>P</italic>-values was constructed. By integrating the PSSM of <italic>P</italic>-values with the frequency PSSM for positive and negative samples, we generated the final encoding PSSM that represented the conservation tendency of the positive or negative samples.</p>
</sec>
</sec>
<sec id="S2.SS3">
<title>Architecture of the Machine-Learning Models</title>
<sec id="S2.SS3.SSS1">
<title>The LSTM Model With the Word Embedding Encoding (LSTM<sub><italic>WE</italic></sub>)</title>
<p>LSTM<sub><italic>WE</italic></sub> contained five layers, listed as follows (<xref ref-type="fig" rid="F2">Figure 2</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption><p>The LSTM<sub><italic>WE</italic></sub> architecture.</p></caption>
<graphic xlink:href="fcell-08-594587-g002.tif"/>
</fig>
<list list-type="simple">
<list-item>
<label>1.</label>
<p>Input layer. Each peptide segment is converted into an integer vector with the NUM encoding.</p>
</list-item>
<list-item>
<label>2.</label>
<p>Word Embedding (WE) layer. Each integer of the vector from the input layer is encoded into a four-dimensional word vector for humans and a five-dimensional word vector for Arabidopsis, respectively.</p>
</list-item>
<list-item>
<label>3.</label>
<p>LSTM layer. Each of the word vectors is input sequentially into the LSTM cell that contained 32 hidden neuron units.</p>
</list-item>
<list-item>
<label>4.</label>
<p>Dense layer. It contains a single dense sublayer with the ReLU activation function, which has 16 neurons for humans and 32 neurons for Arabidopsis.</p>
</list-item>
<list-item>
<label>5.</label>
<p>Output layer. This layer has only one neuron activated by sigmoid function, outputting the probability of the CSO modification.</p>
</list-item>
</list>
</sec>
<sec id="S2.SS3.SSS2">
<title>The 1D-CNN Model With the Word Embedding Encoding</title>
<p>The 1D-CNN model with the word embedding encoding (1D-CNN<sub><italic>WE</italic></sub>) contains five layers (<xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 1</xref>), where the first two layers and the last layer were the same as those of LSTM<sub><italic>WE</italic></sub>. The third layer was a 1D convolution layer with 22/20 filters for humans/Arabidopsis and a kernel size of nine. The fourth layer had a single dense sublayer with 16 neurons. The optimal hyper-parameter values were obtained using the Bayesian optimization algorithm.</p>
</sec>
<sec id="S2.SS3.SSS3">
<title>The 2D-CNN Model With the PSSM Feature</title>
<p>We took advantage of the 2D structure of an input image of CNN architecture and conveniently made similar 2D inputs of PSSM matrices with the sizes of 20 &#x00D7; 20. The purpose of using the 2D-CNN model is to capture the hidden patterns inside PSSM profiles. Next, PSSM profiles were connected to the 2D CNN design from the input layer through several hidden layers to the output layer. <xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 2</xref> demonstrated the procedure of inputting a PSSM profile into the CNN model, then passing through a series of convolutional, non-linearity, pooling and fully connected layers and finally outputting the result. This model contained four hidden layers including one 2D convolutional layer, one pooling layer, one flattening layer, and one fully connected layer. Specifically, the first layer contained a PSSM profile on which we applied 2D convolutional operations with some existing parameters including 5 &#x00D7; 5 kernel size, 15 filters and 1 &#x00D7; 1 stride.</p>
</sec>
<sec id="S2.SS3.SSS4">
<title>The RF Algorithms With Different Features</title>
<p>The RF algorithm integrates multiple decision trees and chooses the classification with the most votes from the trees. Each tree depends on the values of a random vector sampled independently with the same distribution for all trees in the forest. In this study, we constructed the RF models with six different features, including binary encoding, EAAC encoding, AAindex encoding, CKSAAP encoding, PSSM encoding, and WE. The number of decision trees was selected as 580 <italic>via</italic> the grid search method. These classifiers were developed based on the Python module &#x201C;sklearn.&#x201D;</p>
</sec>
<sec id="S2.SS3.SSS5">
<title>The SVM Algorithms With Different Features</title>
<p>We applied the Python-based machine learning package &#x201C;scikit-learn&#x201D; to implement the SVM algorithm and adopted the &#x201C;RBF&#x201D; kernel function to build the SVM models. The above encoding schemes for RF were applied to the SVM model. In particular, we normalized the feature values that do not range between 0 and 1 (such as PSSM) before inputting the SVM model.</p>
</sec>
</sec>
<sec id="S2.SS4">
<title>Model Training Strategy</title>
<sec id="S2.SS4.SSS1">
<title>Optimization Methods for Hyper-Parameters</title>
<p>The hyper-parameters of an ML classifier affect prediction performance. Although a lot of combinations of hyper-parameters need to be tested, there are no formal rules to find optimal hyper-parameters. Here we applied two search approaches [grid search and Bayesian optimization (BO)] to automatically adjust and evaluate hyper-parameters (<xref ref-type="fig" rid="F3">Figure 3</xref>). Grid search is a brute-force method to find the optimal hyper-parameters by training models using each possible combination of hyper-parameters and retaining the hyper-parameters corresponding to the model with the best performance. This method applies to a limited number of hyper-parameters due to the exponential increase in time spent with the number of hyper-parameters. In this study, it was used for the RF-based and SVM-based models. The related grid search spaces (<xref ref-type="supplementary-material" rid="TS2">Supplementary Table 3</xref>) were searched using the GridSearchCV function of the sklearn library in Python. On the contrary, BO provides a principled technique based on Bayes theorem to direct a search of a global optimization problem, which is effective to tune the hyper-parameters of DL models. The BO strategy was executed using the fmin function of the hyperopt library in Python. The BO related hyper-parameter space contained 10 parameters, including window size, kernel size, and dropout rate (<xref ref-type="supplementary-material" rid="TS2">Supplementary Table 3</xref>). The optimal hyper-parameter combination results for the DL models were listed in <xref ref-type="supplementary-material" rid="TS2">Supplementary Table 4</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption><p>Hyper-parameter optimization procedure for machine-learning classifiers.</p></caption>
<graphic xlink:href="fcell-08-594587-g003.tif"/>
</fig>
</sec>
<sec id="S2.SS4.SSS2">
<title>Strategy of Avoiding Overfitting</title>
<p>The parameters in the DL models were trained and optimized based on binary cross-entropy loss function using the Adam algorithm. The maximum of the training cycles was set through the optimized number of epochs to ensure that the loss function value converged. In each epoch, the training dataset was separated with the batch size as 512 and iterated. To avoid overfitting, the early-stopping strategy was applied, where the training process was stopped early when the training loss did not go down within 50 consecutive iterations. The model with the smallest training loss was saved as the best model. Moreover, the dropout rate of the neuron units was set, which was obtained through the hyper-parameter optimization. <xref ref-type="supplementary-material" rid="TS2">Supplementary Figures 3, 4</xref> showed the training and validation accuracy and loss curves of the LSTM<sub><italic>WE</italic></sub> models for different species.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption><p>The flowchart of the prediction model construction.</p></caption>
<graphic xlink:href="fcell-08-594587-g004.tif"/>
</fig>
</sec>
</sec>
<sec id="S2.SS5">
<title>Performance Assessment of the Predictors</title>
<p>Several measures were used to evaluate the prediction performance, including accuracy (ACC), specificity (SP), sensitivity (SN), and Matthew&#x2019;s correlation coefficient (MCC). They are defined as follows:</p>
<disp-formula id="S2.Ex2">
<mml:math id="M4">
<mml:mrow>
<mml:mtext>ACC</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex3">
<mml:math id="M5">
<mml:mrow>
<mml:mtext>SP</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mtext>TN</mml:mtext>
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex4">
<mml:math id="M6">
<mml:mrow>
<mml:mtext>SN</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mtext>TP</mml:mtext>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="S2.Ex5">
<mml:math id="M7">
<mml:mrow>
<mml:mtext>MCC</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>-</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>FN</mml:mtext>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x00D7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x00D7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x00D7;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>where TP, TN, FP, and FN represent true positives, true negatives, false positives, and false negatives, respectively. Additionally, because the number of positive and negative samples was unbalanced and the above measures were calculated based on the threshold value, a measure that was independent of the threshold value and unaffected by the sample ratio was needed. Therefore, the receiver operating characteristic (ROC) curve and AUC were employed to comprehensively evaluate classification performance. Specifically, because a low false-positive rate of a predictor is important in practical applications, the area under the ROC curve with &#x003C;10% false-positive rate (AUC01) was considered.</p>
</sec>
<sec id="S2.SS6">
<title>Statistical Methods</title>
<p>The paired student&#x2019;s <italic>t</italic>-test was used to test the significant difference between the mean values of the two paired populations. The adjusted <italic>P</italic>-value with the Benjamini-Hochberg (BH) method was adopted for multiple comparisons.</p>
</sec>
<sec id="S2.SS7">
<title>The Flowchart of the Prediction Model Construction</title>
<p>The flowchart of the prediction model construction contained three steps (<xref ref-type="fig" rid="F4">Figure 4</xref>). The first step was data collection and preprocessing, in which the sample data were separated into the cross-validation dataset and the independent test dataset for model construction and evaluation. The second step was classifier construction, which involved data decoding, model training, and hyper-parameter adjustment to obtain a robust predictive model. The third step was the development of the final model as an online prediction tool.</p>
</sec>
</sec>
<sec id="S3">
<title>Results and Discussion</title>
<sec id="S3.SS1">
<title>LSTM<sub><italic>WE</italic></sub> Classifier Performed Favorably to Other Classifiers</title>
<p>Many computational approaches for predicting PTM sites are generally based on traditional ML algorithms (e.g., RF and SVM) combined with various features encoded from peptide sequences. In this study, we constructed both RF-based and SVM-based predictors with different encoding schemes for the CSO site prediction. The encoding schemes include six features [i.e., binary, AAindex, WE, CKSAAP, PSSM, and EAAC]. Moreover, deep learning algorithms have recently been applied to the field of PTM site prediction and demonstrated their superior performances (<xref ref-type="bibr" rid="B31">Wang et al., 2017</xref>; <xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>). Accordingly, we developed three different DL classifiers, named 1D-CNN<sub><italic>WE</italic></sub>, 2D-CNN<sub><italic>PSSM</italic></sub>, and LSTM<sub><italic>WE</italic></sub>.</p>
<p>We first took the Arabidopsis data to construct and compare different models (<xref ref-type="bibr" rid="B16">Huang et al., 2019</xref>). The Arabidopsis cross-validation dataset contained 8000 samples (1254 positives and 6746 negatives) and the independent test set covered 801 samples (126 positives and 675 negatives) (<xref ref-type="fig" rid="F1">Figure 1</xref>). We compared the performances of these algorithms in terms of several measures (e.g., ACC, MCC, AUC, and AUC01) for both the 10-fold cross-validation (<xref ref-type="table" rid="T1">Table 1</xref>) and the independent test (<xref ref-type="supplementary-material" rid="TS2">Supplementary Table 5</xref>). In the traditional ML models, RF<sub><italic>EAAC</italic></sub> showed superior performance to other RF-based and SVM-based models. The previous studies of CSO site prediction showed that the models with the combination of different encoding methods compared favorably to their counterparts with a single encoding approach (<xref ref-type="bibr" rid="B5">Bui et al., 2016b</xref>; <xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>). Accordingly, we constructed such models and the RF model with the combination of EAAC, CKSAAP, and AAindex, dubbed RF<sub><italic>E+C+A</italic></sub>, had the best performance among them. To our surprise, RF<sub><italic>E+C+A</italic></sub> had inferior performance compared to RF<sub><italic>EAAC</italic></sub> (<xref ref-type="table" rid="T1">Table 1</xref> and <xref ref-type="supplementary-material" rid="TS2">Supplementary Table 5</xref>).</p>
<table-wrap position="float" id="T1">
<label>TABLE 1</label>
<caption><p>Performances of various classifiers for different species in terms of 10-fold cross-validation.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>Classifier<sup>1</sup></bold></td>
<td valign="top" align="center"><bold>ACC<sup>2</sup></bold></td>
<td valign="top" align="center"><bold>Sn<sup>2</sup></bold></td>
<td valign="top" align="center"><bold>Sp<sup>2</sup></bold></td>
<td valign="top" align="center"><bold>MCC<sup>2</sup></bold></td>
<td valign="top" align="center"><bold>AUC<sup>2</sup></bold></td>
<td valign="top" align="center"><bold>AUC01<sup>2</sup></bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" colspan="7"><bold><italic>Arabidopsis thaliana</italic></bold></td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>BINARY</italic></sub></td>
<td valign="top" align="center">0.743 &#x00B1; 0.006</td>
<td valign="top" align="center">0.449 &#x00B1; 0.040</td>
<td valign="top" align="center">0.798 &#x00B1; 0.001</td>
<td valign="top" align="center">0.210 &#x00B1; 0.032</td>
<td valign="top" align="center">0.696 &#x00B1; 0.021</td>
<td valign="top" align="center">0.014 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>EAAC</italic></sub></td>
<td valign="top" align="center">0.773 &#x00B1; 0.007</td>
<td valign="top" align="center">0.628 &#x00B1; 0.043</td>
<td valign="top" align="center">0.799 &#x00B1; 0.001</td>
<td valign="top" align="center">0.351 &#x00B1; 0.033</td>
<td valign="top" align="center">0.803 &#x00B1; 0.019</td>
<td valign="top" align="center">0.024 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>WE</italic></sub></td>
<td valign="top" align="center">0.748 &#x00B1; 0.007</td>
<td valign="top" align="center">0.474 &#x00B1; 0.048</td>
<td valign="top" align="center">0.799 &#x00B1; 0.001</td>
<td valign="top" align="center">0.230 &#x00B1; 0.038</td>
<td valign="top" align="center">0.728 &#x00B1; 0.020</td>
<td valign="top" align="center">0.014 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>AAINDEX</italic></sub></td>
<td valign="top" align="center">0.744 &#x00B1; 0.008</td>
<td valign="top" align="center">0.443 &#x00B1; 0.053</td>
<td valign="top" align="center">0.800 &#x00B1; 0.001</td>
<td valign="top" align="center">0.206 &#x00B1; 0.043</td>
<td valign="top" align="center">0.710 &#x00B1; 0.025</td>
<td valign="top" align="center">0.014 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>CKSAAP</italic></sub></td>
<td valign="top" align="center">0.749 &#x00B1; 0.012</td>
<td valign="top" align="center">0.477 &#x00B1; 0.078</td>
<td valign="top" align="center">0.800 &#x00B1; 0.001</td>
<td valign="top" align="center">0.234 &#x00B1; 0.062</td>
<td valign="top" align="center">0.728 &#x00B1; 0.032</td>
<td valign="top" align="center">0.013 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.740 &#x00B1; 0.006</td>
<td valign="top" align="center">0.419 &#x00B1; 0.039</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.188 &#x00B1; 0.032</td>
<td valign="top" align="center">0.670 &#x00B1; 0.028</td>
<td valign="top" align="center">0.015 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>E+C+A</italic></sub></td>
<td valign="top" align="center">0.760 &#x00B1; 0.006</td>
<td valign="top" align="center">0.544 &#x00B1; 0.040</td>
<td valign="top" align="center">0.800 &#x00B1; 0.001</td>
<td valign="top" align="center">0.287 &#x00B1; 0.031</td>
<td valign="top" align="center">0.770 &#x00B1; 0.016</td>
<td valign="top" align="center">0.020 &#x00B1; 0.005</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>BINARY</italic></sub></td>
<td valign="top" align="center">0.748 &#x00B1; 0.009</td>
<td valign="top" align="center">0.479 &#x00B1; 0.055</td>
<td valign="top" align="center">0.798 &#x00B1; 0.003</td>
<td valign="top" align="center">0.234 &#x00B1; 0.043</td>
<td valign="top" align="center">0.719 &#x00B1; 0.025</td>
<td valign="top" align="center">0.017 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>EAAC</italic></sub></td>
<td valign="top" align="center">0.746 &#x00B1; 0.009</td>
<td valign="top" align="center">0.458 &#x00B1; 0.060</td>
<td valign="top" align="center">0.799 &#x00B1; 0.001</td>
<td valign="top" align="center">0.218 &#x00B1; 0.048</td>
<td valign="top" align="center">0.704 &#x00B1; 0.026</td>
<td valign="top" align="center">0.015 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>AAINDEX</italic></sub></td>
<td valign="top" align="center">0.750 &#x00B1; 0.008</td>
<td valign="top" align="center">0.486 &#x00B1; 0.054</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.241 &#x00B1; 0.042</td>
<td valign="top" align="center">0.724 &#x00B1; 0.023</td>
<td valign="top" align="center">0.016 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>CKSAAP</italic></sub></td>
<td valign="top" align="center">0.739 &#x00B1; 0.007</td>
<td valign="top" align="center">0.421 &#x00B1; 0.047</td>
<td valign="top" align="center">0.798 &#x00B1; 0.003</td>
<td valign="top" align="center">0.187 &#x00B1; 0.037</td>
<td valign="top" align="center">0.692 &#x00B1; 0.030</td>
<td valign="top" align="center">0.013 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.726 &#x00B1; 0.008</td>
<td valign="top" align="center">0.330 &#x00B1; 0.054</td>
<td valign="top" align="center">0.800 &#x00B1; 0.001</td>
<td valign="top" align="center">0.113 &#x00B1; 0.046</td>
<td valign="top" align="center">0.590 &#x00B1; 0.025</td>
<td valign="top" align="center">0.009 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left">2D-CNN<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.766 &#x00B1; 0.010</td>
<td valign="top" align="center">0.585 &#x00B1; 0.064</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.319 &#x00B1; 0.050</td>
<td valign="top" align="center">0.781 &#x00B1; 0.030</td>
<td valign="top" align="center">0.023 &#x00B1; 0.004</td>
</tr>
<tr>
<td valign="top" align="left">1D-CNN<sub><italic>WE</italic></sub></td>
<td valign="top" align="center">0.783 &#x00B1; 0.006</td>
<td valign="top" align="center">0.696 &#x00B1; 0.041</td>
<td valign="top" align="center">0.799 &#x00B1; 0.001</td>
<td valign="top" align="center">0.401 &#x00B1; 0.030</td>
<td valign="top" align="center">0.838 &#x00B1; 0.019</td>
<td valign="top" align="center">0.029 &#x00B1; 0.005</td>
</tr>
<tr>
<td valign="top" align="left"><bold>LSTM<sub><italic>WE</italic></sub></bold></td>
<td valign="top" align="center"><bold>0.786 &#x00B1; 0.007</bold></td>
<td valign="top" align="center"><bold>0.717 &#x00B1; 0.044</bold></td>
<td valign="top" align="center"><bold>0.799 &#x00B1; 0.001</bold></td>
<td valign="top" align="center"><bold>0.417 &#x00B1; 0.032</bold></td>
<td valign="top" align="center"><bold>0.852 &#x00B1; 0.018</bold></td>
<td valign="top" align="center"><bold>0.030 &#x00B1; 0.006</bold></td>
</tr>
<tr>
<td valign="top" align="left" colspan="7"><bold><italic>Homo sapiens</italic></bold></td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>BINARY</italic></sub></td>
<td valign="top" align="center">0.749 &#x00B1; 0.004</td>
<td valign="top" align="center">0.466 &#x00B1; 0.027</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.225 &#x00B1; 0.021</td>
<td valign="top" align="center">0.720 &#x00B1; 0.013</td>
<td valign="top" align="center">0.016 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>EAAC</italic></sub></td>
<td valign="top" align="center">0.766 &#x00B1; 0.006</td>
<td valign="top" align="center">0.578 &#x00B1; 0.039</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.312 &#x00B1; 0.030</td>
<td valign="top" align="center">0.790 &#x00B1; 0.018</td>
<td valign="top" align="center">0.020 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>WE</italic></sub></td>
<td valign="top" align="center">0.751 &#x00B1; 0.004</td>
<td valign="top" align="center">0.480 &#x00B1; 0.024</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.236 &#x00B1; 0.019</td>
<td valign="top" align="center">0.732 &#x00B1; 0.015</td>
<td valign="top" align="center">0.018 &#x00B1; 0.001</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>AAINDEX</italic></sub></td>
<td valign="top" align="center">0.750 &#x00B1; 0.004</td>
<td valign="top" align="center">0.474 &#x00B1; 0.025</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.231 &#x00B1; 0.020</td>
<td valign="top" align="center">0.734 &#x00B1; 0.017</td>
<td valign="top" align="center">0.018 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>CKSAAP</italic></sub></td>
<td valign="top" align="center">0.753 &#x00B1; 0.003</td>
<td valign="top" align="center">0.493 &#x00B1; 0.018</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.246 &#x00B1; 0.014</td>
<td valign="top" align="center">0.729 &#x00B1; 0.016</td>
<td valign="top" align="center">0.016 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.748 &#x00B1; 0.004</td>
<td valign="top" align="center">0.462 &#x00B1; 0.026</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.222 &#x00B1; 0.021</td>
<td valign="top" align="center">0.707 &#x00B1; 0.016</td>
<td valign="top" align="center">0.016 &#x00B1; 0.001</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>E+C+A</italic></sub></td>
<td valign="top" align="center">0.761 &#x00B1; 0.005</td>
<td valign="top" align="center">0.551 &#x00B1; 0.033</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.291 &#x00B1; 0.026</td>
<td valign="top" align="center">0.774 &#x00B1; 0.012</td>
<td valign="top" align="center">0.021 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>BINARY</italic></sub></td>
<td valign="top" align="center">0.750 &#x00B1; 0.005</td>
<td valign="top" align="center">0.474 &#x00B1; 0.030</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.231 &#x00B1; 0.024</td>
<td valign="top" align="center">0.720 &#x00B1; 0.013</td>
<td valign="top" align="center">0.017 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>EAAC</italic></sub></td>
<td valign="top" align="center">0.742 &#x00B1; 0.007</td>
<td valign="top" align="center">0.421 &#x00B1; 0.049</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.188 &#x00B1; 0.039</td>
<td valign="top" align="center">0.680 &#x00B1; 0.021</td>
<td valign="top" align="center">0.013 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>AAINDEX</italic></sub></td>
<td valign="top" align="center">0.753 &#x00B1; 0.006</td>
<td valign="top" align="center">0.498 &#x00B1; 0.041</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.250 &#x00B1; 0.032</td>
<td valign="top" align="center">0.737 &#x00B1; 0.021</td>
<td valign="top" align="center">0.017 &#x00B1; 0.001</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>CKSAAP</italic></sub></td>
<td valign="top" align="center">0.737 &#x00B1; 0.005</td>
<td valign="top" align="center">0.388 &#x00B1; 0.031</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.162 &#x00B1; 0.025</td>
<td valign="top" align="center">0.664 &#x00B1; 0.012</td>
<td valign="top" align="center">0.012 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">SVM<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.725 &#x00B1; 0.005</td>
<td valign="top" align="center">0.316 &#x00B1; 0.033</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.101 &#x00B1; 0.028</td>
<td valign="top" align="center">0.578 &#x00B1; 0.025</td>
<td valign="top" align="center">0.011 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">2D-CNN<sub><italic>PSSM</italic></sub></td>
<td valign="top" align="center">0.766 &#x00B1; 0.004</td>
<td valign="top" align="center">0.581 &#x00B1; 0.029</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.314 &#x00B1; 0.022</td>
<td valign="top" align="center">0.777 &#x00B1; 0.011</td>
<td valign="top" align="center">0.022 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left">1D-CNN<sub><italic>WE</italic></sub></td>
<td valign="top" align="center">0.778 &#x00B1; 0.006</td>
<td valign="top" align="center">0.659 &#x00B1; 0.036</td>
<td valign="top" align="center">0.800 &#x00B1; 0.000</td>
<td valign="top" align="center">0.373 &#x00B1; 0.027</td>
<td valign="top" align="center">0.819 &#x00B1; 0.012</td>
<td valign="top" align="center">0.024 &#x00B1; 0.003</td>
</tr>
<tr>
<td valign="top" align="left"><bold>LSTM<sub><italic>WE</italic></sub></bold></td>
<td valign="top" align="center"><bold>0.777 &#x00B1; 0.006</bold></td>
<td valign="top" align="center"><bold>0.651 &#x00B1; 0.038</bold></td>
<td valign="top" align="center"><bold>0.800 &#x00B1; 0.000</bold></td>
<td valign="top" align="center"><bold>0.367 &#x00B1; 0.028</bold></td>
<td valign="top" align="center"><bold>0.822 &#x00B1; 0.011</bold></td>
<td valign="top" align="center"><bold>0.024 &#x00B1; 0.003</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic><sup>1</sup>The RF classifiers with the different features were named as RF<sub><italic>BINARY</italic></sub>, RF<sub><italic>WE</italic></sub>, etc. The 1D CNN and LSTM classifiers with the word embedding approach were named as 1D-CNN<sub><italic>WE</italic></sub> and LSTM<sub><italic>WE</italic></sub>, respectively.<sup>2</sup>ACC, Sn, Sp, MCC, AUC, and AUC01 were described in section &#x201C;Materials and Methods.&#x201D; In the 10-fold cross-validation, 10 models were constructed using the 10 different validation datasets. Finally, the average performance and standard deviation of the 10 models were calculated for the cross-validation dataset. The models with the best performances were highlighted in bold.</italic></attrib>
</table-wrap-foot>
</table-wrap>
<p>All the models constructed above were based on the imbalanced dataset. To evaluate the effect of the imbalanced dataset on potential overfitting of the classifiers, we reconstructed RF<sub><italic>EAAC</italic></sub> based on the balanced positive and negative samples. Specifically, because the number of negative samples was around five times larger than that of the positive samples, we randomly separated the negative samples into five parts and created five subsets of training data with a 1:1 positive-to-negative ratio. Subsequently, five RF<sub><italic>EAAC</italic></sub> models (sub-classifiers) were trained and the average output score from the five sub-classifiers was taken as the final prediction score. <xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 5</xref> showed the performances of the two RF<sub><italic>EAAC</italic></sub> models based on the balanced and imbalanced dataset, respectively, in terms of the 10-fold cross-validation and the independent test dataset. Because of the slightly better performance of the RF<sub><italic>EAAC</italic></sub> model constructed using an imbalanced training dataset, we selected the imbalanced dataset for the construction of the models.</p>
<p>In our previous studies, DL models showed superior performance to traditional ML models (<xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>; <xref ref-type="bibr" rid="B39">Zhao et al., 2020</xref>). It is still true for the CSO site prediction. LSTM<sub><italic>WE</italic></sub> had the best performance among these constructed models in terms of ACC, Sn, MCC, and AUC values for both 10-fold cross-validation and independent test. For instance, its AUC value was 0.852 for the cross-validation and its values of ACC, Sn, Sp, and MCC were 0.786, 0.717, 0.799, and 0.417, respectively (<xref ref-type="table" rid="T1">Table 1</xref> and <xref ref-type="fig" rid="F5">Figures 5A,C</xref>). As prediction performance at a low false-positive rate is highly useful in practice, we estimated these predictors using AUC01, where the specificity was determined to be &#x003E;90%. LSTM<sub><italic>WE</italic></sub> again showed the largest AUC01 values for both 10-fold cross-validation and the independent test (<xref ref-type="fig" rid="F5">Figures 5B,D</xref>). As the encoding approach has a great impact on the traditional ML models (<xref ref-type="bibr" rid="B8">Chen et al., 2018b</xref>; <xref ref-type="bibr" rid="B18">Huang Y. et al., 2018</xref>; <xref ref-type="bibr" rid="B39">Zhao et al., 2020</xref>) and the WE approach integrated with LSTM had the best performance in this study, we attempted to investigate whether the integration of WE and RF had a good performance. Accordingly, we extracted WE layer vector as feature encoding from LSTM<sub><italic>WE</italic></sub> and trained the RF model, dubbed RF<sub><italic>WE</italic></sub>. Interestingly, RF<sub><italic>WE</italic></sub> did not show good performance compared to RF<sub><italic>EAAC</italic></sub>, 1D-CNN<sub><italic>WE</italic></sub>, or LSTM<sub><italic>WE</italic></sub>. It suggests that the WE encoding approach may be improper for the construction of traditional ML algorithms.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption><p>Performance comparison of different CSO predictors on <italic>Arabidopsis thaliana</italic>. The performances of CSO predictors were compared in terms of AUC <bold>(A)</bold> and AUC01 <bold>(B)</bold>, respectively, for 10-fold cross-validation. AUC <bold>(C)</bold> and AUC01 <bold>(D)</bold> curves were generated using the independent test.</p></caption>
<graphic xlink:href="fcell-08-594587-g005.tif"/>
</fig>
<p>We further constructed the models for the human organism. The human cross-validation dataset contained 16,249 samples (2507 positives and 13,742 negatives) and the independent test set covered 1625 samples (251 positives and 1374 negatives) (<xref ref-type="fig" rid="F1">Figure 1B</xref>). Similarly, LSTM<sub><italic>WE</italic></sub> had the best performance (<xref ref-type="table" rid="T1">Table 1</xref>, <xref ref-type="supplementary-material" rid="TS2">Supplementary Table 5</xref>, and <xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 6</xref>). For instance, its values of AUC, ACC, Sn, Sp, MCC, and AUC01 for the 10-fold cross-validation were 0.822, 0.777, 0.651, 0.800, 0.367, and 0.024, respectively. We evaluated the robustness of LSTM<sub><italic>WE</italic></sub> by comparing its performances between the cross-validation and independent tests for individual organisms. As its performances were not statistically different for each organism (<italic>P</italic> = 0.18/0.085 for the arabidopsis/humans, respectively), we concluded that the constructed models were robust and neither over-fitting nor under-fitting.</p>
</sec>
<sec id="S3.SS2">
<title>LSTM<sub><italic>WE</italic></sub> Performed Better Than Reported Classifiers</title>
<p>Six approaches for the prediction of human CSO sites were based on 1105 identified human CSO sites (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>), including MDD-SOH, SOHSite, SOHPRED, iSulf-Cys, SulCysSite, and Sulf_FSVM. We compared these models and our models (i.e., RF<sub><italic>EAAC</italic></sub>, RF<sub><italic>E+C+A</italic></sub>, and LSTM<sub><italic>WE</italic></sub>) to evaluate their prediction performances. Accordingly, we constructed our models using the same dataset derived from the original study (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>). SulCysSite, LSTM<sub><italic>WE</italic></sub>, and RF<sub><italic>E+C+A</italic></sub> had the best and similar performances (<xref ref-type="table" rid="T2">Table 2</xref>). The observation that the model with the combined features (i.e., RF<sub><italic>E+C+A</italic></sub>) had better accuracy than the counterpart with a single feature (i.e., RF<sub><italic>EAAC</italic></sub>) is consistent with the previous studies (<xref ref-type="bibr" rid="B5">Bui et al., 2016b</xref>; <xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>) but conflicts with our observation above that RF<sub><italic>EAAC</italic></sub> compared favorably to RF<sub><italic>E+C+A</italic></sub>. This contradiction derived from the different amounts of the training datasets, where the dataset here was smaller than the datasets described above, indicating that the amount of training data affected the performance of the models. Indeed, based on the small human dataset (1105 positives), RF<sub><italic>E+C+A</italic></sub> had a better performance than RF<sub><italic>EAAC</italic></sub>, whereas the performance of RF<sub><italic>EAAC</italic></sub> was better than that of RF<sub><italic>E+C+A</italic></sub> with a large amount of the training set (arabidopsis: 1380 positives; human 2758 positives) (<xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 7</xref>). 
In all comparisons, LSTM<sub><italic>WE</italic></sub> showed the best performance (<xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 7</xref>). Additionally, as iSulf-Cys (<xref ref-type="bibr" rid="B35">Xu et al., 2016</xref>) is the only accessible model to date, we compared it with LSTM<sub><italic>WE</italic></sub> using the human independent dataset of this study. The AUC value (0.839) of LSTM<sub><italic>WE</italic></sub> is significantly larger than that (0.666) of iSulf-Cys (<xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 8</xref>). In summary, LSTM<sub><italic>WE</italic></sub> performed better than reported classifiers.</p>
<table-wrap position="float" id="T2">
<label>TABLE 2</label>
<caption><p>The k-fold cross-validation results of existing tools.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>Tools&#x002A;</bold></td>
<td valign="top" align="center"><bold>Fold</bold></td>
<td valign="top" align="center"><bold>Accuracy</bold></td>
<td valign="top" align="center"><bold>Sensitivity</bold></td>
<td valign="top" align="center"><bold>Specificity</bold></td>
<td valign="top" align="center"><bold>AUC</bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">MDD-SOH</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">0.68</td>
<td valign="top" align="center">0.7</td>
<td valign="top" align="center">0.7</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">SOHSite</td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">0.71</td>
<td valign="top" align="center">0.72</td>
<td valign="top" align="center">0.72</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">SOHPRED</td>
<td valign="top" align="center">5</td>
<td/>
<td valign="top" align="center">0.727 &#x00B1; 0.005</td>
<td valign="top" align="center">0.742 &#x00B1; 0.001</td>
<td valign="top" align="center">0.801 &#x00B1; 0.001</td>
</tr>
<tr>
<td valign="top" align="left">iSulf-Cys</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0.656 &#x00B1; 0.007</td>
<td valign="top" align="center">0.673 &#x00B1; 0.007</td>
<td valign="top" align="center">0.639 &#x00B1; 0.001</td>
<td valign="top" align="center">0.716 &#x00B1; 0.009</td>
</tr>
<tr>
<td valign="top" align="left">SulCysSite</td>
<td valign="top" align="center">10</td>
<td/>
<td valign="top" align="center">0.745 &#x00B1; 0.006</td>
<td valign="top" align="center">0.744 &#x00B1; 0.002</td>
<td valign="top" align="center">0.806 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">Sulf_FSVM</td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0.711 &#x00B1; 0.002</td>
<td valign="top" align="center">0.733 &#x00B1; 0.004</td>
<td valign="top" align="center">0.708 &#x00B1; 0.002</td>
<td valign="top" align="center">0.788 &#x00B1; 0.002</td>
</tr>
<tr>
<td valign="top" align="left">LSTM<sub><italic>WE</italic></sub></td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0.739 &#x00B1; 0.006</td>
<td valign="top" align="center">0.694 &#x00B1; 0.042</td>
<td valign="top" align="center">0.744 &#x00B1; 0.008</td>
<td valign="top" align="center">0.800 &#x00B1; 0.011</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>EAAC</italic></sub></td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0.733 &#x00B1; 0.006</td>
<td valign="top" align="center">0.607 &#x00B1; 0.021</td>
<td valign="top" align="center">0.750 &#x00B1; 0.007</td>
<td valign="top" align="center">0.753 &#x00B1; 0.006</td>
</tr>
<tr>
<td valign="top" align="left">RF<sub><italic>E+C+A</italic></sub></td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">0.743 &#x00B1; 0.009</td>
<td valign="top" align="center">0.728 &#x00B1; 0.027</td>
<td valign="top" align="center">0.745 &#x00B1; 0.009</td>
<td valign="top" align="center">0.807 &#x00B1; 0.010</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<attrib><italic>&#x002A;The cross-validation dataset was derived from Yang&#x2019;s publication (<xref ref-type="bibr" rid="B36">Yang et al., 2014</xref>).</italic></attrib>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="S3.SS3">
<title>Conservation of the CSO Modification and the Development of General LSTM<sub><italic>WE</italic></sub> Models</title>
<p>Cysteine S-sulphenylation has been identified across various organisms, ranging from yeasts to worms and from plants to humans (<xref ref-type="bibr" rid="B22">Men and Wang, 2007</xref>; <xref ref-type="bibr" rid="B15">Hourihan et al., 2016</xref>). To understand its conservation, we compared the characteristics of CSO-containing peptides in human and arabidopsis species, respectively, using the two-sample-logo approach (<xref ref-type="bibr" rid="B27">Vacic et al., 2006</xref>). <xref ref-type="fig" rid="F6">Figure 6</xref> showed that both species shared the enriched basic amino acids R and K and the depleted polar neutral amino acid C. Nevertheless, the amino acid H was enriched for <italic>A. thaliana</italic> whereas the hydrophobic amino acid L was depleted for <italic>H. sapiens</italic>. As the characteristics of CSO-containing peptides were similar between both species, we hypothesized that our developed models could generalize across species. To test this hypothesis, we used the human LSTM<sub><italic>WE</italic></sub> model to predict the arabidopsis independent test dataset and employed the arabidopsis LSTM<sub><italic>WE</italic></sub> model to predict the human independent test dataset. The AUC values were 0.799 and 0.766, respectively, significantly larger than the random prediction (i.e., AUC = 0.5; <xref ref-type="table" rid="T3">Table 3</xref>). Nevertheless, the cross-species prediction had relatively low performance compared to the self-species prediction (AUC = 0.876/0.839 for arabidopsis/human, respectively). As the CSO sites were systematically analyzed in a few species, we developed a general CSO prediction model according to its conservation to boost the investigation for other species. Accordingly, we mixed the training datasets of <italic>H. sapiens</italic> and <italic>A. thaliana</italic> and constructed the general LSTM<sub><italic>WE</italic></sub> model and validated it using the independent datasets from both organisms. 
The performance of the general LSTM<sub><italic>WE</italic></sub> model was slightly lower than that of the self-species prediction, which may be caused by the interference of the CSO characteristics of other species (<xref ref-type="table" rid="T3">Table 3</xref>). Overall, the conservation of the CSO modification leads to the effective prediction of the general LSTM<sub><italic>WE</italic></sub> classifier.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption><p>Sequence pattern surrounding the CSO sites, including the significantly enriched and depleted residues based on CSO-containing peptides and non-modification peptides for <italic>H. sapiens</italic> <bold>(A)</bold> and <italic>A. thaliana</italic> <bold>(B)</bold> (<italic>P</italic> &#x003C; 0.05, <italic>t</italic>-test with Bonferroni correction). The pattern was generated using the two-sample-logo method (<xref ref-type="bibr" rid="B27">Vacic et al., 2006</xref>).</p></caption>
<graphic xlink:href="fcell-08-594587-g006.tif"/>
</fig>
<table-wrap position="float" id="T3">
<label>TABLE 3</label>
<caption><p>Evaluation of species-specific and general LSTM<sub><italic>WE</italic></sub> models using the independent test sets from different species.</p></caption>
<table cellspacing="5" cellpadding="5" frame="hsides" rules="groups">
<thead>
<tr>
<td valign="top" align="left"><bold>Independent test sets</bold></td>
<td valign="top" align="center" colspan="3"><bold>LSTM<sub><italic>WE</italic></sub> model (AUC value)</bold><hr/></td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><bold>Arabidopsis-specific</bold></td>
<td valign="top" align="center"><bold>Human-specific</bold></td>
<td valign="top" align="center"><bold>General</bold></td>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>A. thaliana</italic></td>
<td valign="top" align="center">0.876</td>
<td valign="top" align="center">0.799</td>
<td valign="top" align="center">0.863</td>
</tr>
<tr>
<td valign="top" align="left"><italic>H. sapiens</italic></td>
<td valign="top" align="center">0.766</td>
<td valign="top" align="center">0.839</td>
<td valign="top" align="center">0.834</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To further understand the performance of the general LSTM<sub><italic>WE</italic></sub> classifier, we visualized the sample distribution, based on the human independent dataset, from the outputs of the input layer, WE layer, LSTM layer, and dense layer of the general model using the t-SNE algorithm (<xref ref-type="bibr" rid="B29">van der Maaten and Hinton, 2008</xref>; <xref ref-type="fig" rid="F7">Figure 7</xref>). After the input layer (<xref ref-type="fig" rid="F7">Figure 7A</xref>), the positive and negative samples were mixed; as the training went on (<xref ref-type="fig" rid="F7">Figures 7B,C</xref>), the positive and negative samples were gradually separated. After the dense layer, they were separated (<xref ref-type="fig" rid="F7">Figure 7D</xref>). This comparison indicates that the LSTM layer is a powerful method to detect the distinctive features of the positives and negatives. A similar observation is made for the arabidopsis independent test dataset (<xref ref-type="supplementary-material" rid="TS2">Supplementary Figure 9</xref>).</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption><p>t-SNE visualization of the distributions of peptides in the human independent dataset for the outputs of input layer <bold>(A)</bold>, word embedding layer <bold>(B)</bold>, LSTM layer <bold>(C)</bold>, and dense layer <bold>(D)</bold> of the general LSTM<sub><italic>WE</italic></sub> model.</p></caption>
<graphic xlink:href="fcell-08-594587-g007.tif"/>
</fig>
</sec>
<sec id="S3.SS4">
<title>Construction of the Online CSO Predictor</title>
<p>We developed an easy-to-use online tool for the prediction of the CSO sites, dubbed DeepCSO. DeepCSO contains three LSTM<sub><italic>WE</italic></sub> models: the general model and two species-specific models (i.e., <italic>H. sapiens</italic> and <italic>A. thaliana</italic>). The users could select the general model or species-specific model at the input interface and input the query protein sequences directly or upload the sequence file. After the job submission, the prediction will start and the prediction process may take several minutes. Finally, the prediction results are output in tabular form with five columns: sequence header, position, sequence, prediction score, and prediction results at the specificity levels of 80, 85, and 90%, respectively.</p>
<p>Several cysteine modification types have been reported in the human organism, such as carbonylation (<xref ref-type="bibr" rid="B30">Wang et al., 2014</xref>; <xref ref-type="bibr" rid="B6">Chen et al., 2017</xref>, <xref ref-type="bibr" rid="B7">2018a</xref>; <xref ref-type="bibr" rid="B37">Zhang S. et al., 2019</xref>), oxidation (<xref ref-type="bibr" rid="B13">Gupta et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Akter et al., 2018</xref>), succination (<xref ref-type="bibr" rid="B1">Adam et al., 2017</xref>), and sulfenylation. Some cysteine sites can be modified with multiple modification types, which cause PTM cross-regulation. To examine potential PTM cross-regulation at the proteome scale, we downloaded the latest human protein sequences from the Swiss-Prot database (version: 2020_05) and applied the human DeepCSO predictor to predict the potential CSO sites with the annotation of the reported cysteine modifications (<xref ref-type="supplementary-material" rid="TS2">Supplementary Table 6</xref>). This resource will assist in the investigation of the cysteine co-regulation in the community.</p>
</sec>
</sec>
<sec id="S4">
<title>Conclusion</title>
<p>The current prediction tools for CSO sites are based on traditional ML methodology that requires experts to pre-define informative features, and no prediction tool has been developed for organisms other than human. In this study, three LSTM-based prediction models were constructed, where two were organism-specific and one was general, and they compared favorably to the reported models. Despite lacking pre-defined features, the deep learning classifier demonstrated superior performance compared to the traditional machine learning methods. This may be due to the self-learning ability of deep learning. The outstanding performance of the general model suggests that the CSO is well conserved and the LSTM-based model has an advantage in long-term memory to capture the key features of the entire sequences.</p>
</sec>
<sec id="S5">
<title>Data Availability Statement</title>
<p>The 10-fold cross-validation and independent data sets can be found in <ext-link ext-link-type="uri" xlink:href="http://www.bioinfogo.org/DeepCSO/">http://www.bioinfogo.org/DeepCSO/</ext-link>.</p>
</sec>
<sec id="S6">
<title>Author Contributions</title>
<p>LL conceived this project. XL and SL constructed the algorithms under the supervision of LL and YZ. CJ, XL, and NH analyzed the data. XL, YZ, NH, ZC, and LL wrote the manuscript. All authors read and approved the final manuscript.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</body>
<back>
<fn-group>
<fn fn-type="financial-disclosure">
<p><bold>Funding.</bold> This work was supported in part by funds from the Young Scientists Fund of the National Natural Science Foundation of China (Grant No. 31701142 to ZC), and the National Natural Science Foundation of China (Grant Nos. 31770821 and 32071430 to LL); LL was supported by the &#x201C;Distinguished Expert of Overseas Tai Shan Scholar&#x201D; program. YZ was supported by the Qingdao Applied Research Project.</p>
</fn>
</fn-group>
<sec id="S8" sec-type="supplementary material"><title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fcell.2020.594587/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fcell.2020.594587/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.XLSX" id="TS1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table_2.DOCX" id="TS2" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Adam</surname> <given-names>J.</given-names></name> <name><surname>Ramracheya</surname> <given-names>R.</given-names></name> <name><surname>Chibalina</surname> <given-names>M. V.</given-names></name> <name><surname>Ternette</surname> <given-names>N.</given-names></name> <name><surname>Hamilton</surname> <given-names>A.</given-names></name> <name><surname>Tarasov</surname> <given-names>A. I.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Fumarate hydratase deletion in pancreatic beta cells leads to progressive diabetes.</article-title> <source><italic>Cell Rep.</italic></source> <volume>20</volume> <fpage>3135</fpage>&#x2013;<lpage>3148</lpage>. <pub-id pub-id-type="doi">10.1016/j.celrep.2017.08.093</pub-id> <pub-id pub-id-type="pmid">28954230</pub-id></citation></ref>
<ref id="B2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Akter</surname> <given-names>S.</given-names></name> <name><surname>Fu</surname> <given-names>L.</given-names></name> <name><surname>Jung</surname> <given-names>Y.</given-names></name> <name><surname>Conte</surname> <given-names>M. L.</given-names></name> <name><surname>Lawson</surname> <given-names>J. R.</given-names></name> <name><surname>Lowther</surname> <given-names>W. T.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>Chemical proteomics reveals new targets of cysteine sulfinic acid reductase.</article-title> <source><italic>Nat. Chem. Biol.</italic></source> <volume>14</volume> <fpage>995</fpage>&#x2013;<lpage>1004</lpage>. <pub-id pub-id-type="doi">10.1038/s41589-018-0116-2</pub-id> <pub-id pub-id-type="pmid">30177848</pub-id></citation></ref>
<ref id="B3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bhasin</surname> <given-names>M.</given-names></name> <name><surname>Raghava</surname> <given-names>G. P.</given-names></name></person-group> (<year>2004</year>). <article-title>Classification of nuclear receptors based on amino acid composition and dipeptide composition.</article-title> <source><italic>J. Biol. Chem.</italic></source> <volume>279</volume> <fpage>23262</fpage>&#x2013;<lpage>23266</lpage>. <pub-id pub-id-type="doi">10.1074/jbc.m401932200</pub-id> <pub-id pub-id-type="pmid">15039428</pub-id></citation></ref>
<ref id="B4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bui</surname> <given-names>V. M.</given-names></name> <name><surname>Lu</surname> <given-names>C. T.</given-names></name> <name><surname>Ho</surname> <given-names>T. T.</given-names></name> <name><surname>Lee</surname> <given-names>T. Y.</given-names></name></person-group> (<year>2016a</year>). <article-title>MDD-SOH: exploiting maximal dependence decomposition to identify S-sulfenylation sites with substrate motifs.</article-title> <source><italic>Bioinformatics</italic></source> <volume>32</volume> <fpage>165</fpage>&#x2013;<lpage>172</lpage>.</citation></ref>
<ref id="B5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bui</surname> <given-names>V. M.</given-names></name> <name><surname>Weng</surname> <given-names>S. L.</given-names></name> <name><surname>Lu</surname> <given-names>C. T.</given-names></name> <name><surname>Chang</surname> <given-names>T. H.</given-names></name> <name><surname>Weng</surname> <given-names>J. T.</given-names></name> <name><surname>Lee</surname> <given-names>T. Y.</given-names></name></person-group> (<year>2016b</year>). <article-title>SOHSite: incorporating evolutionary information and physicochemical properties to identify protein S-sulfenylation sites.</article-title> <source><italic>BMC Genomics</italic></source> <volume>17</volume><issue>(Suppl. 1):9</issue>. <pub-id pub-id-type="doi">10.1186/s12864-015-2299-1</pub-id> <pub-id pub-id-type="pmid">26819243</pub-id></citation></ref>
<ref id="B6"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Cong</surname> <given-names>Y.</given-names></name> <name><surname>Quan</surname> <given-names>B.</given-names></name> <name><surname>Lan</surname> <given-names>T.</given-names></name> <name><surname>Chu</surname> <given-names>X.</given-names></name> <name><surname>Ye</surname> <given-names>Z.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>Chemoproteomic profiling of targets of lipid-derived electrophiles by bioorthogonal aminooxy probe.</article-title> <source><italic>Redox Biol.</italic></source> <volume>12</volume> <fpage>712</fpage>&#x2013;<lpage>718</lpage>. <pub-id pub-id-type="doi">10.1016/j.redox.2017.04.001</pub-id> <pub-id pub-id-type="pmid">28411555</pub-id></citation></ref>
<ref id="B7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Lan</surname> <given-names>T.</given-names></name> <name><surname>Qin</surname> <given-names>W.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>Qin</surname> <given-names>K.</given-names></name><etal/></person-group> (<year>2018a</year>). <article-title>Quantitative profiling of protein carbonylations in ferroptosis by an aniline-derived probe.</article-title> <source><italic>J. Am. Chem. Soc.</italic></source> <volume>140</volume> <fpage>4712</fpage>&#x2013;<lpage>4720</lpage>. <pub-id pub-id-type="doi">10.1021/jacs.8b01462</pub-id> <pub-id pub-id-type="pmid">29569437</pub-id></citation></ref>
<ref id="B8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>He</surname> <given-names>N.</given-names></name> <name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Qin</surname> <given-names>W. T.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name></person-group> (<year>2018b</year>). <article-title>Integration of a deep learning classifier with a random forest approach for predicting malonylation sites.</article-title> <source><italic>Genomics Proteomics Bioinform.</italic></source> <volume>16</volume> <fpage>451</fpage>&#x2013;<lpage>459</lpage>. <pub-id pub-id-type="doi">10.1016/j.gpb.2018.08.004</pub-id> <pub-id pub-id-type="pmid">30639696</pub-id></citation></ref>
<ref id="B9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Zhao</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Leier</surname> <given-names>A.</given-names></name> <name><surname>Marquez-Lago</surname> <given-names>T. T.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name><etal/></person-group> (<year>2018c</year>). <article-title>iFeature: a Python package and web server for features extraction and selection from protein and peptide sequences.</article-title> <source><italic>Bioinformatics</italic></source> <volume>34</volume> <fpage>2499</fpage>&#x2013;<lpage>2502</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty140</pub-id> <pub-id pub-id-type="pmid">29528364</pub-id></citation></ref>
<ref id="B10"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Zhao</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>F.</given-names></name> <name><surname>Marquez-Lago</surname> <given-names>T. T.</given-names></name> <name><surname>Leier</surname> <given-names>A.</given-names></name> <name><surname>Revote</surname> <given-names>J.</given-names></name><etal/></person-group> (<year>2020</year>). <article-title>iLearn: an integrated platform and meta-learner for feature engineering, machine-learning analysis and modeling of DNA, RNA and protein sequence data.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>21</volume> <fpage>1047</fpage>&#x2013;<lpage>1057</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbz041</pub-id> <pub-id pub-id-type="pmid">31067315</pub-id></citation></ref>
<ref id="B11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Choudhury</surname> <given-names>F. K.</given-names></name> <name><surname>Rivero</surname> <given-names>R. M.</given-names></name> <name><surname>Blumwald</surname> <given-names>E.</given-names></name> <name><surname>Mittler</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>Reactive oxygen species, abiotic stress and stress combination.</article-title> <source><italic>Plant J.</italic></source> <volume>90</volume> <fpage>856</fpage>&#x2013;<lpage>867</lpage>. <pub-id pub-id-type="doi">10.1111/tpj.13299</pub-id> <pub-id pub-id-type="pmid">27801967</pub-id></citation></ref>
<ref id="B12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Xu</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name></person-group> (<year>2018</year>). <article-title>PredCSO: an ensemble method for the prediction of S-sulfenylation sites in proteins.</article-title> <source><italic>Mol. Omics</italic></source> <volume>14</volume> <fpage>257</fpage>&#x2013;<lpage>265</lpage>. <pub-id pub-id-type="doi">10.1039/c8mo00089a</pub-id> <pub-id pub-id-type="pmid">29942948</pub-id></citation></ref>
<ref id="B13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>V.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Liebler</surname> <given-names>D. C.</given-names></name> <name><surname>Carroll</surname> <given-names>K. S.</given-names></name></person-group> (<year>2017</year>). <article-title>Diverse redoxome reactivity profiles of carbon nucleophiles.</article-title> <source><italic>J. Am. Chem. Soc.</italic></source> <volume>139</volume> <fpage>5588</fpage>&#x2013;<lpage>5595</lpage>.</citation></ref>
<ref id="B14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hasan</surname> <given-names>M. M.</given-names></name> <name><surname>Guo</surname> <given-names>D. J.</given-names></name> <name><surname>Kurata</surname> <given-names>H.</given-names></name></person-group> (<year>2017</year>). <article-title>Computational identification of protein S-sulfenylation sites by incorporating the multiple sequence features information.</article-title> <source><italic>Mol. Biosyst.</italic></source> <volume>13</volume> <fpage>2545</fpage>&#x2013;<lpage>2550</lpage>. <pub-id pub-id-type="doi">10.1039/c7mb00491e</pub-id> <pub-id pub-id-type="pmid">28990628</pub-id></citation></ref>
<ref id="B15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hourihan</surname> <given-names>J. M.</given-names></name> <name><surname>Moronetti Mazzeo</surname> <given-names>L. E.</given-names></name> <name><surname>Fernandez-Cardenas</surname> <given-names>L. P.</given-names></name> <name><surname>Blackwell</surname> <given-names>T. K.</given-names></name></person-group> (<year>2016</year>). <article-title>Cysteine sulfenylation directs IRE-1 to activate the SKN-1/Nrf2 antioxidant response.</article-title> <source><italic>Mol. Cell</italic></source> <volume>63</volume> <fpage>553</fpage>&#x2013;<lpage>566</lpage>. <pub-id pub-id-type="doi">10.1016/j.molcel.2016.07.019</pub-id> <pub-id pub-id-type="pmid">27540856</pub-id></citation></ref>
<ref id="B16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>J.</given-names></name> <name><surname>Willems</surname> <given-names>P.</given-names></name> <name><surname>Wei</surname> <given-names>B.</given-names></name> <name><surname>Tian</surname> <given-names>C.</given-names></name> <name><surname>Ferreira</surname> <given-names>R. B.</given-names></name> <name><surname>Bodra</surname> <given-names>N.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>Mining for protein S-sulfenylation in <italic>Arabidopsis</italic> uncovers redox-sensitive sites.</article-title> <source><italic>Proc. Natl. Acad. Sci. U.S.A.</italic></source> <volume>116</volume> <fpage>21256</fpage>&#x2013;<lpage>21261</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1906768116</pub-id> <pub-id pub-id-type="pmid">31578252</pub-id></citation></ref>
<ref id="B17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>J. J.</given-names></name> <name><surname>Willems</surname> <given-names>P.</given-names></name> <name><surname>Van Breusegem</surname> <given-names>F.</given-names></name> <name><surname>Messens</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>Pathways crossing mammalian and plant sulfenomic landscapes.</article-title> <source><italic>Free Radic. Biol. Med.</italic></source> <volume>122</volume> <fpage>193</fpage>&#x2013;<lpage>201</lpage>. <pub-id pub-id-type="doi">10.1016/j.freeradbiomed.2018.02.012</pub-id> <pub-id pub-id-type="pmid">29476921</pub-id></citation></ref>
<ref id="B18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>N.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <article-title>BERMP: a cross-species classifier for predicting m(6)A sites by integrating a deep learning algorithm and a random forest approach.</article-title> <source><italic>Int. J. Biol. Sci.</italic></source> <volume>14</volume> <fpage>1669</fpage>&#x2013;<lpage>1677</lpage>. <pub-id pub-id-type="doi">10.7150/ijbs.27819</pub-id> <pub-id pub-id-type="pmid">30416381</pub-id></citation></ref>
<ref id="B19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jia</surname> <given-names>C.</given-names></name> <name><surname>Zuo</surname> <given-names>Y.</given-names></name></person-group> (<year>2017</year>). <article-title>S-SulfPred: a sensitive predictor to capture S-sulfenylation sites based on a resampling one-sided selection undersampling-synthetic minority oversampling technique.</article-title> <source><italic>J. Theor. Biol.</italic></source> <volume>422</volume> <fpage>84</fpage>&#x2013;<lpage>89</lpage>. <pub-id pub-id-type="doi">10.1016/j.jtbi.2017.03.031</pub-id> <pub-id pub-id-type="pmid">28411111</pub-id></citation></ref>
<ref id="B20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ju</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>S. Y.</given-names></name></person-group> (<year>2018</year>). <article-title>Prediction of S-sulfenylation sites using mRMR feature selection and fuzzy support vector machine algorithm.</article-title> <source><italic>J. Theor. Biol.</italic></source> <volume>457</volume> <fpage>6</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1016/j.jtbi.2018.08.022</pub-id> <pub-id pub-id-type="pmid">30125576</pub-id></citation></ref>
<ref id="B21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>R.</given-names></name> <name><surname>Klockenbusch</surname> <given-names>C.</given-names></name> <name><surname>Lin</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>H.</given-names></name> <name><surname>Lin</surname> <given-names>S.</given-names></name> <name><surname>Kast</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>Quantitative protein sulfenic acid analysis identifies platelet releasate-induced activation of integrin beta2 on monocytes via NADPH oxidase.</article-title> <source><italic>J. Proteome Res.</italic></source> <volume>15</volume> <fpage>4221</fpage>&#x2013;<lpage>4233</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jproteome.6b00212</pub-id> <pub-id pub-id-type="pmid">27690452</pub-id></citation></ref>
<ref id="B22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Men</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name></person-group> (<year>2007</year>). <article-title>The oxidation of yeast alcohol dehydrogenase-1 by hydrogen peroxide in vitro.</article-title> <source><italic>J. Proteome Res.</italic></source> <volume>6</volume> <fpage>216</fpage>&#x2013;<lpage>225</lpage>. <pub-id pub-id-type="doi">10.1021/pr0603809</pub-id> <pub-id pub-id-type="pmid">17203966</pub-id></citation></ref>
<ref id="B23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mhamdi</surname> <given-names>A.</given-names></name> <name><surname>Van Breusegem</surname> <given-names>F.</given-names></name></person-group> (<year>2018</year>). <article-title>Reactive oxygen species in plant development.</article-title> <source><italic>Development</italic></source> <volume>145</volume>:<issue>dev164376</issue>. <pub-id pub-id-type="doi">10.1242/dev.164376</pub-id> <pub-id pub-id-type="pmid">30093413</pub-id></citation></ref>
<ref id="B24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Paulsen</surname> <given-names>C. E.</given-names></name> <name><surname>Carroll</surname> <given-names>K. S.</given-names></name></person-group> (<year>2013</year>). <article-title>Cysteine-mediated redox signaling: chemistry, biology, and tools for discovery.</article-title> <source><italic>Chem. Rev.</italic></source> <volume>113</volume> <fpage>4633</fpage>&#x2013;<lpage>4679</lpage>. <pub-id pub-id-type="doi">10.1021/cr300163e</pub-id> <pub-id pub-id-type="pmid">23514336</pub-id></citation></ref>
<ref id="B25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sakka</surname> <given-names>M.</given-names></name> <name><surname>Tzortzis</surname> <given-names>G.</given-names></name> <name><surname>Mantzaris</surname> <given-names>M. D.</given-names></name> <name><surname>Bekas</surname> <given-names>N.</given-names></name> <name><surname>Kellici</surname> <given-names>T. F.</given-names></name> <name><surname>Likas</surname> <given-names>A.</given-names></name><etal/></person-group> (<year>2016</year>). <article-title>PRESS: PRotEin S-Sulfenylation server.</article-title> <source><italic>Bioinformatics</italic></source> <volume>32</volume> <fpage>2710</fpage>&#x2013;<lpage>2712</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btw301</pub-id> <pub-id pub-id-type="pmid">27187205</pub-id></citation></ref>
<ref id="B26"><citation citation-type="journal"><collab>UniProt Consortium</collab> (<year>2011</year>). <article-title>Ongoing and future developments at the universal protein resource.</article-title> <source><italic>Nucleic Acids Res.</italic></source> <volume>39</volume> <fpage>D214</fpage>&#x2013;<lpage>D219</lpage>.</citation></ref>
<ref id="B27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vacic</surname> <given-names>V.</given-names></name> <name><surname>Iakoucheva</surname> <given-names>L. M.</given-names></name> <name><surname>Radivojac</surname> <given-names>P.</given-names></name></person-group> (<year>2006</year>). <article-title>Two sample logo: a graphical representation of the differences between two sets of sequence alignments.</article-title> <source><italic>Bioinformatics</italic></source> <volume>22</volume> <fpage>1536</fpage>&#x2013;<lpage>1537</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl151</pub-id> <pub-id pub-id-type="pmid">16632492</pub-id></citation></ref>
<ref id="B28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Van Breusegem</surname> <given-names>F.</given-names></name> <name><surname>Dat</surname> <given-names>J. F.</given-names></name></person-group> (<year>2006</year>). <article-title>Reactive oxygen species in plant cell death.</article-title> <source><italic>Plant Physiol.</italic></source> <volume>141</volume> <fpage>384</fpage>&#x2013;<lpage>390</lpage>.</citation></ref>
<ref id="B29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>van der Maaten</surname> <given-names>L.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2008</year>). <article-title>Visualizing data using t-SNE.</article-title> <source><italic>J. Mach. Learn. Res.</italic></source> <volume>9</volume> <fpage>2579</fpage>&#x2013;<lpage>2605</lpage>.</citation></ref>
<ref id="B30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Weerapana</surname> <given-names>E.</given-names></name> <name><surname>Blewett</surname> <given-names>M. M.</given-names></name> <name><surname>Cravatt</surname> <given-names>B. F.</given-names></name></person-group> (<year>2014</year>). <article-title>A chemoproteomic platform to quantitatively map targets of lipid-derived electrophiles.</article-title> <source><italic>Nat. Methods</italic></source> <volume>11</volume> <fpage>79</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.2759</pub-id> <pub-id pub-id-type="pmid">24292485</pub-id></citation></ref>
<ref id="B31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Zeng</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>C.</given-names></name> <name><surname>Qiu</surname> <given-names>W.</given-names></name> <name><surname>Liang</surname> <given-names>Y.</given-names></name> <name><surname>Joshi</surname> <given-names>T.</given-names></name><etal/></person-group> (<year>2017</year>). <article-title>MusiteDeep: a deep-learning framework for general and kinase-specific phosphorylation site prediction.</article-title> <source><italic>Bioinformatics</italic></source> <volume>33</volume> <fpage>3909</fpage>&#x2013;<lpage>3916</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btx496</pub-id> <pub-id pub-id-type="pmid">29036382</pub-id></citation></ref>
<ref id="B32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <name><surname>Mu</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>Fu-SulfPred: identification of protein S-sulfenylation sites by fusing forests via Chou&#x2019;s general PseAAC.</article-title> <source><italic>J. Theor. Biol.</italic></source> <volume>461</volume> <fpage>51</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1016/j.jtbi.2018.10.046</pub-id> <pub-id pub-id-type="pmid">30365947</pub-id></citation></ref>
<ref id="B33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Yan</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Song</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>SOHPRED: a new bioinformatics tool for the characterization and prediction of human S-sulfenylation sites.</article-title> <source><italic>Mol. Biosyst.</italic></source> <volume>12</volume> <fpage>2849</fpage>&#x2013;<lpage>2858</lpage>. <pub-id pub-id-type="doi">10.1039/c6mb00314a</pub-id> <pub-id pub-id-type="pmid">27364688</pub-id></citation></ref>
<ref id="B34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Ma</surname> <given-names>W.</given-names></name> <name><surname>Huang</surname> <given-names>J.</given-names></name><etal/></person-group> (<year>2018</year>). <article-title>DeepNitro: prediction of protein nitration and nitrosylation sites by deep learning.</article-title> <source><italic>Genomics Proteomics Bioinform.</italic></source> <volume>16</volume> <fpage>294</fpage>&#x2013;<lpage>306</lpage>. <pub-id pub-id-type="doi">10.1016/j.gpb.2018.04.007</pub-id> <pub-id pub-id-type="pmid">30268931</pub-id></citation></ref>
<ref id="B35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>L. Y.</given-names></name></person-group> (<year>2016</year>). <article-title>iSulf-Cys: prediction of S-sulfenylation sites in proteins with physicochemical properties of amino acids.</article-title> <source><italic>PLoS One</italic></source> <volume>11</volume>:<issue>e0154237</issue>. <pub-id pub-id-type="doi">10.1371/journal.pone.0154237</pub-id> <pub-id pub-id-type="pmid">27104833</pub-id></citation></ref>
<ref id="B36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Gupta</surname> <given-names>V.</given-names></name> <name><surname>Carroll</surname> <given-names>K. S.</given-names></name> <name><surname>Liebler</surname> <given-names>D. C.</given-names></name></person-group> (<year>2014</year>). <article-title>Site-specific mapping and quantification of protein S-sulphenylation in cells.</article-title> <source><italic>Nat. Commun.</italic></source> <volume>5</volume>:<issue>4776</issue>.</citation></ref>
<ref id="B37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Fang</surname> <given-names>C.</given-names></name> <name><surname>Yuan</surname> <given-names>W.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Yan</surname> <given-names>G.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>Selective identification and site-specific quantification of 4-Hydroxy-2-nonenal-modified proteins.</article-title> <source><italic>Anal. Chem.</italic></source> <volume>91</volume> <fpage>5235</fpage>&#x2013;<lpage>5243</lpage>. <pub-id pub-id-type="doi">10.1021/acs.analchem.8b05970</pub-id> <pub-id pub-id-type="pmid">30892874</pub-id></citation></ref>
<ref id="B38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Xie</surname> <given-names>R.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Leier</surname> <given-names>A.</given-names></name> <name><surname>Marquez-Lago</surname> <given-names>T. T.</given-names></name> <name><surname>Akutsu</surname> <given-names>T.</given-names></name><etal/></person-group> (<year>2019</year>). <article-title>Computational analysis and prediction of lysine malonylation sites by exploiting informative features in an integrative machine-learning framework.</article-title> <source><italic>Brief. Bioinform.</italic></source> <volume>20</volume> <fpage>2185</fpage>&#x2013;<lpage>2199</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bby079</pub-id> <pub-id pub-id-type="pmid">30351377</pub-id></citation></ref>
<ref id="B39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>N.</given-names></name> <name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>Identification of protein lysine crotonylation sites by a deep learning framework with convolutional neural networks.</article-title> <source><italic>IEEE Access</italic></source> <volume>8</volume> <fpage>14244</fpage>&#x2013;<lpage>14252</lpage>. <pub-id pub-id-type="doi">10.1109/access.2020.2966592</pub-id></citation></ref>
</ref-list>
<fn-group>
<fn id="footnote1">
<label>1</label>
<p><ext-link ext-link-type="uri" xlink:href="http://www.genome.jp/aaindex/">http://www.genome.jp/aaindex/</ext-link></p></fn>
</fn-group>
</back>
</article>