<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Chem.</journal-id>
<journal-title>Frontiers in Chemistry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Chem.</abbrev-journal-title>
<issn pub-type="epub">2296-2646</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1059593</article-id>
<article-id pub-id-type="doi">10.3389/fchem.2022.1059593</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Chemistry</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Modelling eNvironment for Isoforms (MoNvIso): A general platform to predict structural determinants of protein isoforms in genetic diseases</article-title>
<alt-title alt-title-type="left-running-head">Oliva et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fchem.2022.1059593">10.3389/fchem.2022.1059593</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Oliva</surname>
<given-names>Francesco</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Musiani</surname>
<given-names>Francesco</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2100716/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Giorgetti</surname>
<given-names>Alejandro</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/472639/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>De Rubeis</surname>
<given-names>Silvia</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/594712/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sorokina</surname>
<given-names>Oksana</given-names>
</name>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/46187/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Armstrong</surname>
<given-names>Douglas J.</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1858/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Carloni</surname>
<given-names>Paolo</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff11">
<sup>11</sup>
</xref>
<xref ref-type="aff" rid="aff12">
<sup>12</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/464339/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Ruggerone</surname>
<given-names>Paolo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/132692/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Physics</institution>, <institution>University of Cagliari</institution>, <addr-line>Monserrato (CA)</addr-line>, <country>Italy</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Institute of Neuroscience and Medicine INM-9</institution>, <institution>Institute for Advanced Simulations IAS-5</institution>, <institution>Forschungszentrum J&#xfc;lich</institution>, <addr-line>J&#xfc;lich</addr-line>, <country>Germany</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Laboratory of Bioinorganic Chemistry</institution>, <institution>Department of Pharmacy and Biotechnology</institution>, <institution>University of Bologna</institution>, <addr-line>Bologna</addr-line>, <country>Italy</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Biotechnology</institution>, <institution>University of Verona</institution>, <addr-line>Verona</addr-line>, <country>Italy</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Seaver Autism Center for Research and Treatment</institution>, <institution>Icahn School of Medicine at Mount Sinai</institution>, <addr-line>New York</addr-line>, <addr-line>NY</addr-line>, <country>United States</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Department of Psychiatry</institution>, <institution>Icahn School of Medicine at Mount Sinai</institution>, <addr-line>New York</addr-line>, <addr-line>NY</addr-line>, <country>United States</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>The Mindich Child Health and Development Institute</institution>, <institution>Icahn School of Medicine at Mount Sinai</institution>, <addr-line>New York</addr-line>, <addr-line>NY</addr-line>, <country>United States</country>
</aff>
<aff id="aff8">
<sup>8</sup>
<institution>Friedman Brain Institute</institution>, <institution>Icahn School of Medicine at Mount Sinai</institution>, <addr-line>New York</addr-line>, <addr-line>NY</addr-line>, <country>United States</country>
</aff>
<aff id="aff9">
<sup>9</sup>
<institution>The School of Informatics</institution>, <institution>University of Edinburgh</institution>, <addr-line>Edinburgh</addr-line>, <country>United Kingdom</country>
</aff>
<aff id="aff10">
<sup>10</sup>
<institution>Simons Initiative for the Developing Brain</institution>, <institution>University of Edinburgh</institution>, <addr-line>Edinburgh</addr-line>, <country>United Kingdom</country>
</aff>
<aff id="aff11">
<sup>11</sup>
<institution>Department of Physics</institution>, <institution>RWTH Aachen University</institution>, <addr-line>Aachen</addr-line>, <country>Germany</country>
</aff>
<aff id="aff12">
<sup>12</sup>
<institution>JARA-Institute: Molecular Neuroscience and Neuroimaging</institution>, <institution>Institute for Neuroscience and Medicine INM-11/JARA-BRAIN Institute JBI-2</institution>, <institution>Forschungszentrum J&#xfc;lich GmbH</institution>, <addr-line>J&#xfc;lich</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/122658/overview">Sergio Pantano</ext-link>, Institut Pasteur de Montevideo, Uruguay</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/289423/overview">Paolo A. Calligari</ext-link>, University of Rome Tor Vergata, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1786509/overview">Durba Sengupta</ext-link>, National Chemical Laboratory (CSIR), India</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Paolo Ruggerone, <email>paolo.ruggerone@dsf.unica.it</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Theoretical and Computational Chemistry, a section of the journal Frontiers in Chemistry</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>01</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>10</volume>
<elocation-id>1059593</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>10</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>12</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Oliva, Musiani, Giorgetti, De Rubeis, Sorokina, Armstrong, Carloni and Ruggerone.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Oliva, Musiani, Giorgetti, De Rubeis, Sorokina, Armstrong, Carloni and Ruggerone</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The seamless integration of human disease-related mutation data into protein structures is an essential component of any attempt to correctly assess the impact of the mutation. The key step preliminary to any structural modelling is the identification of the isoforms onto which mutations should be mapped due to there being several functionally different protein isoforms from the same gene. To handle large sets of data coming from omics techniques, this challenging task needs to be automatized. Here we present the MoNvIso (Modelling eNvironment for Isoforms) code, which identifies the most useful isoform for computational modelling, balancing the coverage of mutations of interest and the availability of templates to build a structural model of both the wild-type isoform and the related variants.</p>
</abstract>
<kwd-group>
<kwd>isoform identification</kwd>
<kwd>mutations</kwd>
<kwd>molecular modelling</kwd>
<kwd>proteins</kwd>
<kwd>diseases</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The spatial and functional diversity of the 20,465 protein-coding genes (<xref ref-type="bibr" rid="B6">Howe et al., 2021</xref>) (<ext-link ext-link-type="uri" xlink:href="https://www.ensembl.org/">https://www.ensembl.org/</ext-link>) in the human genome is dramatically augmented through alternative splicing that results in an enormous number of potential protein isoforms. Exact numbers are not fully known but common estimates for total isoforms are in the 10X range (245,000 transcripts in <ext-link ext-link-type="uri" xlink:href="https://www.ensembl.org/">https://www.ensembl.org/</ext-link>). Alternative splicing can result in isoforms with relatively subtle changes through to those that vary enormously in their structure, function, and subcellular spatial expression (<xref ref-type="bibr" rid="B19">Park et al., 2018</xref>).</p>
<p>Indeed, most functional (and dysfunctional) biochemical processes are affected by the expressed isoforms, which feature distinct functional roles. Examples of this complexity include the neuroligin and neurexin families, which perform synaptic regulatory functions that are surprisingly isoform specific (<xref ref-type="bibr" rid="B14">Markwick et al., 2007</xref>; <xref ref-type="bibr" rid="B28">Slabinski et al., 2007</xref>). This complexity may be increased by the addition of genetic variants, which can directly influence the protein structure and function of the isoform. Moreover, genetic variations can also affect the splice mechanisms and change the isoforms directly (<xref ref-type="bibr" rid="B19">Park et al., 2018</xref>), but this is not addressed in this study.</p>
<p>Further information, key to our understanding of genetic diseases, is the availability of three-dimensional structures of a protein. The structure of many human proteins is now available by accurate - yet time-consuming (<xref ref-type="bibr" rid="B14">Markwick et al., 2007</xref>; <xref ref-type="bibr" rid="B28">Slabinski et al., 2007</xref>) - experimental techniques (such as X-ray diffraction, NMR and electron microscopy (<xref ref-type="bibr" rid="B17">Murata and Wolf, 2018</xref>)). These accurate but demanding approaches are complemented by fast (and more approximate) computational predictions (<xref ref-type="bibr" rid="B11">Kuhlman and Bradley, 2019</xref>), including homology modelling (<xref ref-type="bibr" rid="B11">Kuhlman and Bradley, 2019</xref>) and deep learning techniques such as AlphaFold (AF) (<xref ref-type="bibr" rid="B29">Tunyasuvunakool et al., 2021</xref>), based on experimental structural information of evolutionarily related template protein(s) (<xref ref-type="bibr" rid="B11">Kuhlman and Bradley, 2019</xref>). Unfortunately, all these methods do not usually provide the isoforms most likely involved in the process of interest.</p>
<p>Here we present a computational platform that selects specifically the most useful isoform for molecular modelling and provides structural information, in the context of identified genetic variants. The presence of a variable number of protein isoforms makes it challenging to assign each mutation to a specific position in the protein sequence, which frequently hampers a reliable assessment of the impact of the genetic variations (including disease relevant mutations (<xref ref-type="bibr" rid="B22">Rees et al., 2010</xref>; <xref ref-type="bibr" rid="B9">Kato et al., 2018</xref>)) on an isoform suitable for molecular modelling. In other cases, a mutation is observed that is relevant to a specific isoform, but the databases reporting mutations related to a particular genetic disease usually lack a reference to the specific isoform.</p>
<p>Given a set of mutations at the protein expression level, our pipeline can correctly assign them to the corresponding isoforms at the protein level, providing important information that can be used for further investigations. The second key step of the determination of the isoform most useful for molecular modelling is achieved by combining the mutation-isoform map with the sequence coverage of available structural templates.</p>
</sec>
<sec id="s2">
<title>2 The MoNvIso (Modelling eNvironment for Isoforms) pipeline</title>
<p>The general workflow of MoNvIso is summarised in <xref ref-type="fig" rid="F1">Figure 1</xref> and proceeds according to three steps described in more detail in the next subsections:<list list-type="simple">
<list-item>
<p>1) Step 1: check of the gene names provided in the input file, identification of canonical and additional isoforms extracted from the Uniprot database. In the input file a list of the mutations of interest is also present.</p>
</list-item>
<list-item>
<p>2) Step 2: check of the modelling propensity and how properly mutations are mapped on the available isoforms. The availability of templates is supervised by MoNvIso, as well as the association of the mutations to the appropriate isoforms. MoNvIso highlights failures in this mapping procedure, i.e., when mutations cannot be mapped on any available isoforms.</p>
</list-item>
<list-item>
<p>3) Step 3: Building of the structural model of the identified proteins. Models of the wild-type (WT) forms and of their variants (selected by MoNvIso according to Step 2) are built if experimental structures are not already available for the selected isoforms.</p>
</list-item>
</list>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>MoNvIso&#x2019;s flowchart.</p>
</caption>
<graphic xlink:href="fchem-10-1059593-g001.tif"/>
</fig>
<p>The selection procedure is based on a function, named <bold>
<italic>Selection</italic>
</bold>, (Step 2) that casts two contributions as follows:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>w</mml:mi>
<mml:mn>2</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>The two terms, <bold>
<italic>Structural function</italic>
</bold> and <bold>
<italic>Mutation function</italic>
</bold>, numerically translate the modelling propensity and the mapping of the mutations on the available isoforms to accomplish the two conditions. <italic>w1</italic> and <italic>w2</italic> are the weights of the two terms. By default, <italic>w1</italic> &#x3d; <italic>w2</italic> &#x3d; 10, but they can be adjusted by the user. <bold>
<italic>Structural function</italic>
</bold> and <bold>
<italic>Mutation function</italic>
</bold> are described in more detail in Subsection 2.2 (Step 2).</p>
<p>Input and output files for the proteins KRAS and KDM5C are collected in example_p1.rar and example_p2.rar, which can be downloaded at <ext-link ext-link-type="uri" xlink:href="https://github.com/MoNvIsoModeling/MoNvIso">https://github.com/MoNvIsoModeling/MoNvIso</ext-link>.</p>
<sec id="s2-1">
<title>2.1 Step 1</title>
<p>MoNvIso checks the list of gene names and the set of point mutations provided by the user. The mutations can be indicated in the input file according to different formats: three-letter or single-letter names for the amino acids. Additionally, spaces and tabs are accepted to simplify the creation of the list by the user. Every gene name is searched against the Uniprot (<xref ref-type="bibr" rid="B3">Bateman et al., 2021</xref>) database; the results are extracted from two files, namely <italic>uniprot_sprot.fasta</italic>, which contains the amino acid sequences of the canonical isoforms according to the classification of Uniprot, and <italic>uniprot_sprot_varsplic.fasta</italic>, which collects the sequences of the remaining isoforms obtained from Uniprot (see <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref> for the list of folders and files created by MoNvIso).</p>
</sec>
<sec id="s2-2">
<title>2.2 Step 2</title>
<p>MoNvIso then performs an analysis on each isoform extracted from the Uniprot entry (see Step 1) based on two functions: 1) checking the modelling propensity and 2) mapping of the mutations. A score is associated with each function and the combination of the two is used to select the isoform most suitable to be modelled. Independently of the isoform chosen for modelling, MoNvIso provides the information on the mutations mapped onto all the isoforms. In detail:</p>
<sec id="s2-2-1">
<title>2.2.1 Checking the modelling propensity</title>
<p>Each isoform is then processed according to a standard procedure: a search for homologous sequences is performed using the BLAST API (<xref ref-type="bibr" rid="B1">Altschul et al., 1990</xref>), which allows users to submit BLAST searches for processing through cloud service provider(s) using HTTPS; and a multiple sequence alignment (MSA) is generated using COBALT (<xref ref-type="bibr" rid="B18">Papadopoulos and Agarwala, 2007</xref>). Subsequently, based on the MSA, the hmmsearch function of HMMER (version 3.3.2; <ext-link ext-link-type="uri" xlink:href="http://hmmer.org/">http://hmmer.org/</ext-link>) uses the HMM (Hidden Markov Model) (<xref ref-type="bibr" rid="B4">Baum and Petrie, 1966</xref>) to find relevant templates in the PDB. The 10 most similar sequences for the identified PDB structures are downloaded and the chains necessary for the homology modelling are extracted as separate files. The extracted structures are cleaned of water molecules, ligands, disordered atoms, and non-standard residues, then aligned to the MSA and made available to the user in a folder (see <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref>).</p>
<p>The resulting structures are ranked by resolution and sequence identity to find the most appropriate templates, thus excluding crystals with poor resolution or with sequences that are very different from the original query (see Section Limitations). The default values of the sequence identity and resolution thresholds are 25% and 4.5&#xa0;&#xc5;, respectively. However, the thresholds can be modified by the user. A further selection criterion is applied by calculating the coverage of the input sequence by the sequences of the templates. To this aim, MoNvIso identifies the minimum number of templates necessary to model the highest percentage of the target sequence. For a given target sequence (for example, Isoform 1 &#x3d; ADRRVLTY) and the set of templates identified as described above (for example, Template A: AD, Template B: AD, Template C: RRVLT, Template D: DRR), MoNvIso proceeds as follows:<list list-type="simple">
<list-item>
<p>1) Sorting of the templates according to the covered lengths, in our case Templates A, B, D, C;</p>
</list-item>
<list-item>
<p>2) Checking if the given sequence is covered by more than one template or by a combination of templates. In our case, Templates A and B cover the same portion;</p>
</list-item>
<list-item>
<p>3) If a single template covers the target, then this template is considered (which is not the case of our example);</p>
</list-item>
<list-item>
<p>4) If the target is covered either by a longer template or by a combination of other templates (with at least one covering extra portions of the protein), the proper selection is considered. In our example, this is accomplished by the combination of Templates A and C, with the choice between Templates A and B dictated only by alphabetical order.</p>
</list-item>
</list>
</p>
<p>The described procedure is applied by MoNvIso to entire sequences or portions of them and to all the possible additional isoforms (our example deals with a second isoform, Isoform 2 &#x3d; ADRKVLTY). Note that information about covered sections and associated templates are stored in the <italic>covered_intervals</italic> file produced by MoNvIso.</p>
<p>Starting from the above description, the term <bold>
<italic>Structural function</italic>
</bold> in Eq. <xref ref-type="disp-formula" rid="e1">1</xref>, accounts for the availability of crystallographic data defined as the number of amino acids (AAs) that are covered by a template (or a combination of templates) over the total number of AAs constituting the isoform<disp-formula id="e2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>In the above example, for Isoform 1 we have <bold>
<italic>Total AA</italic>
</bold> &#x3d; 8 and <bold>
<italic>Covered AA</italic>
</bold> &#x3d; 7, resulting in a <bold>
<italic>Structural function</italic>
</bold> &#x3d; 0.875, while for Isoform 2 the values of <bold>
<italic>Covered AA</italic>
</bold> and <bold>
<italic>Structural function</italic>
</bold> are 6 and 0.750, respectively.</p>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Mapping of the mutations</title>
<p>The second term of Eq. <xref ref-type="disp-formula" rid="e1">1</xref>, <bold>
<italic>Mutation Function</italic>
</bold>, considers the entire list of mutations provided for the considered gene, thus pinpointing to the isoform most suitable for homology modelling. Our program maps all mutations onto the appropriate isoform and increases by one the numerator, <bold>
<italic>Mutating AA that can be modelled</italic>
</bold>, if the mutated residue can be correctly located in the isoform sequence. The contribution of matched mutations to the selection function is evaluated as follows:<disp-formula id="e3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>g</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>According to our example, for the three mutations T2A, R3A, R4L, MoNvIso highlights that the first mutation T2A is not mapped on the two present isoforms, while it evaluates <bold>
<italic>Mutating AA that can be modelled</italic>
</bold> equal to two and one for Isoforms 1 and 2, respectively. <bold>
<italic>Mutating AA found in at least one isoform</italic>
</bold> is two for both isoforms, <bold>
<italic>Mutation function</italic>
</bold> (Isoform 1) &#x3d; 1, and <bold>
<italic>Mutation function</italic>
</bold> (Isoform 2) &#x3d; 0.5.</p>
<p>For each gene and each isoform, the resulting <bold>
<italic>Selections</italic>
</bold> are reported in the <italic>report.log</italic> file. Moreover, this file contains a report on all mutations inserted in the input file, that is, i) the mapped mutations, ii) the isoform on which they were mapped and iii) the mutations not associated with any isoform, together with iv) the isoform most suitable to be modelled (see <xref ref-type="sec" rid="s13">Supplementary Figure S2</xref>). In our example, the selected isoform to be modelled is Isoform 1 with <bold>
<italic>Selection</italic>
</bold> &#x3d; 18.75.</p>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Step 3</title>
<p>Structural models for the selected isoform in its WT form and in all the variant(s) associated with the properly mapped mutation(s) are then created by using the MODELLER program (<xref ref-type="bibr" rid="B32">Webb and Sali, 2016</xref>) based on the sequence alignment obtained in the previous step. Regions not covered by the templates are not considered. The models are then ranked by the DOPE score (<xref ref-type="bibr" rid="B27">Shen and Sali, 2006</xref>), and MoNvIso yields the top-ranked one (the list of all the models with their DOPE score is in the file MYOUT.dat; see SI for the list of all the files generated by MoNvIso and their location). The modelling of the variants is then performed by taking the MODELLER input file containing the WT sequences of the templates and replacing the mutated AAs in the sequence. MODELLER is then run again to produce the model of the variant(s). This can be useful for mapping the position of mutations on a three-dimensional structure, allowing the study not only of the mutated residue but also of the amino acids in its vicinity and with which the mutated residue may be in contact.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Strengths</title>
<p>Our pipeline exploits a series of tools tailored to manage large sets of proteins. Useful information is provided at each step of the run so that decisions taken by the pipeline can be audited. In the case of a failure of the pipeline to provide a satisfactory structural model, the file <italic>report.csv</italic> traces the mutations on all the isoforms and provides an easy way to identify the isoform mapping the largest number of mutations. The previously mentioned <italic>report.log</italic> file is also important. This file contains all the data that would otherwise have to be manually collected such as the number of isoforms for a gene, the location of the mutations, which mutations cannot be mapped on any known isoform and finally the values of the selection functions. These data can provide a useful starting point if the user needs to manually model the protein. For example, the user, upon data retrieval, can also decide if another isoform should be prioritised because of a mutation of particular interest not present in the isoform selected by the program. Regarding the modelling part of the protocol, the final alignments, the used templates with detailed information on the selection process as well as the coverage are made available to the user, as specified thoroughly in <xref ref-type="sec" rid="s2">Section 2</xref>. Although the process of building the variants can be time consuming if many of them need to be built, this part is fully automated. In most of the tested cases the models built showed a high quality and can be used for further studies (see Section Results). Thus, our pipeline reduces the time necessary to model a large number of proteins by automating the slowest parts of the process including the search for isoforms, the mapping of mutations, the search for crystallographic data to use as templates and the building of the alignments.</p>
</sec>
<sec id="s4">
<title>4 Limitations</title>
<p>As with any modelling study, our method also has limitations. MoNvIso does not model the parts of the protein that are not covered by templates. The solution implemented in the program is the modelling of the single domains, although this implies uncertainty about the reciprocal orientations of the domains. An additional drawback is the possible presence of several small portions that can be modelled but are interspersed by regions not covered by templates. In some cases, the search for templates with HMMER does not return any result (this depends on HMMER&#x2019;s servers). When several successive searches for homologues are queued on BLAST, a slowdown of the runs may occur. Multiple point mutations coexisting on the same protein are not modelled by MoNvIso concurrently. Rather, MoNvIso provides a series of structural models of single amino acid variants for pairwise comparison. Finally, MoNvIso selects the most useful isoform based on available structural data and mutation coverage but there is no guarantee this is the most functionally relevant one in every case.</p>
</sec>
<sec id="s5">
<title>5 Case studies</title>
<p>We tested MoNvIso on a set of 70 proteins. The corresponding 257 human isoforms were extracted from the Uniprot database, and the associated mutations were obtained from the respective Uniprot webpages, with a maximum cap of five mutations per protein. The genes and mutations considered are listed in the file <italic>mutations.txt</italic> provided in Supporting Materials. For all selected proteins MoNvIso was able to produce the alignments and to map the mutations onto the identified isoform. It successfully located, retrieved, and edited the templates to generate the WT structural models as well as the variants, when the identified mutations were in the modelled portions.</p>
<p>Out of the 70 proteins we modelled, 53 WT models could be compared against equivalent ones available in the AF database (DB) (<ext-link ext-link-type="uri" xlink:href="https://alphafold.ebi.ac.uk/">https://alphafold.ebi.ac.uk/</ext-link>). This was done by extracting from the AF model the part of the sequence that we modelled and performing an RMSD analysis on the C&#x03B1;.</p>
<p>For the remaining 17 proteins (BCL11A, CACNA1B, CAMKK1, CAMKK2, DNMT1, FMR1, GABRB3, GRIK2, GRM5, PLXNB1, SCN2A, SLC17A8, SNAP25, STX1A, SYN1, SYT1, TAF1), such comparison was not feasible because the isoform selected by MoNvIso was not the canonical one as considered by AF and was not sufficiently similar for direct comparison, i.e. the number of C&#x03B1; atoms was different. For a further 13 proteins out of 70 we modelled an isoform different from the canonical sequence but the RMSD comparison with the AF models was possible because the changes were localised in regions not covered by templates.</p>
<p>Thus, for a total of 30 proteins out of 70, mutations are best modelled on non-canonical isoforms. The results of the comparison are presented in <xref ref-type="sec" rid="s13">Supplementary Table S1</xref> together with the number of residues for which AF has a high or very high confidence (pLDDT score &#x3e;70) about their position. The genes are ordered from the one with the lowest RMSD value to the highest. According to <xref ref-type="sec" rid="s13">Supplementary Table S1</xref>, 44 out of 57 (77%) models present an RMSD below 20&#xa0;&#xc5;, and a visual inspection reinforces the validity of our results, since the larger RMSD values in this group are mainly due to small, disordered loops. In the group of models with RMSD above 20&#xa0;&#xc5; there are subunits assuming different orientations in both MoNvIso and AF structures. When comparing the number of AA with a high or very high confidence score, we see that in most of our results (46 out of 57), the modelled portion retains at least 50% of these residues.</p>
<p>As an example, we show two structures in <xref ref-type="fig" rid="F2">Figure 2</xref>: the proteins GRIN1 (Glutamate receptor ionotropic, NMDA 1; also known as GluN1; Uniprot &#x23;Q05586) and GRIN2B (Glutamate receptor ionotropic, NMDA 2B; also known as GluN2B; Uniprot &#x23;Q13224). These two transmembrane proteins are subunits of the N-methyl-<sc>d</sc>-aspartate (NMDA) glutamate receptor complex, which contributes to excitatory transmission in the brain. In the first case both AF and MoNvIso produce similar results that differ only in the domains for which no templates are available, but still modelled by AF. Examples of these domains are the C-terminal part, starting from K866 to S938 and the N-terminal helix (residues M1 to D23) that are modelled by AF and not by MoNvIso (see top left and bottom right in <xref ref-type="fig" rid="F2">Figure 2A</xref>, respectively). These two portions of the sequence are not considered by MoNvIso (see Step 3) since there are no available templates to correctly model them, but AF does attempt to model the whole chain. This leads to portions of the model with low or very low confidence scores (calculated by AF), which correspond to a pLDDT between 0 and 70, meaning that those parts of the model are generally unreliable.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Comparison between the ribbon representations of GRIN1 <bold>(A)</bold> and GRIN2B <bold>(B)</bold> model structures generated with AF (left panels) and MoNvIso (right panels). The ribbons are colored from blue to red going from the N- to the C-terminal.</p>
</caption>
<graphic xlink:href="fchem-10-1059593-g002.tif"/>
</fig>
<p>The results for GRIN2B (see <xref ref-type="fig" rid="F2">Figure 2B</xref>) demonstrate the differences between AF and MoNvIso predictions. AF successfully models the N-terminal part of the protein but fails to correctly build the trans and intra-membrane domains, which are then added as loops twisted around the correctly modelled section of the protein. Once again, the portions that are missing from the PDB database are poorly modelled. Since AF has been trained on the PDB dataset (<xref ref-type="bibr" rid="B29">Tunyasuvunakool et al., 2021</xref>), it still relies on available crystallographic data to correctly model structures. Thus, transmembrane domains such as those of GRIN2B, which are underrepresented in that training set because of the scarcity of experimentally determined structures of transmembrane proteins and their complexes (<xref ref-type="bibr" rid="B10">Kermani, 2021</xref>), may fail to be correctly built. In turn, MoNvIso automatically recognises the parts of the protein that can be modelled with confidence. As a result, MoNvIso cuts out of the sequence the extra AAs that cannot be modelled, producing a model ready to be used for further analysis.</p>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>Dissecting the impact of point mutations on the function of a protein is often hindered by a lack of an appropriate mapping of the mutation onto the correct isoform of a protein, of the identification of isoform(s) useful for molecular modelling, and of the associated building of a reliable structure. This knowledge is important because different isoforms of proteins can have widely differing functional roles and spatio-temporal expression profiles. As genomic variants associated with human traits and/or disease are being discovered at an increasing rate, approaches to link them to isoforms and find reliable structural models are urgently needed. MoNvIso addresses these two aspects: mapping a set of point mutations (provided by the user) on known isoforms, along with selecting the isoform most suitable to be modelled. The prediction of the structural models for the WT isoforms and their variants is automated, making MoNvIso appropriate for high-throughput investigations. Although several platforms to provide accurate structures of a protein are available and routinely used (<xref ref-type="bibr" rid="B33">Yang et al., 2014</xref>; <xref ref-type="bibr" rid="B32">Webb and Sali, 2016</xref>; <xref ref-type="bibr" rid="B31">Waterhouse et al., 2018</xref>), surprisingly few of them can be implemented in a pipeline (<xref ref-type="bibr" rid="B32">Webb and Sali, 2016</xref>) to automate the modelling of multiple different proteins. Therefore, our protocol combines this final step with the key preliminary assessment of the isoform correctly mapping the mutation of interest. 
Importantly, all steps of our protocol yield results that can be used at different stages by the user: the identification of specific isoforms containing residues involved in selected mutations is <italic>per se</italic> a remarkable clue for genetic assessment of the impact of isoforms, especially by handling a large number of proteins and point mutations; the set of the templates eventually identified by MoNvIso with the section of the target protein covered by them are made available to the user; finally, the structural predictions represent a valuable starting point for additional refinements and investigations, such as molecular dynamics simulations (<xref ref-type="bibr" rid="B21">Raval et al., 2012</xref>; <xref ref-type="bibr" rid="B5">Hollingsworth and Dror, 2018</xref>; <xref ref-type="bibr" rid="B12">Lazim et al., 2020</xref>; <xref ref-type="bibr" rid="B15">Miller and Phillips, 2021</xref>; <xref ref-type="bibr" rid="B7">Itoh and Okumura, 2022</xref>), hot spots evaluation (<xref ref-type="bibr" rid="B16">Murakami et al., 2017</xref>; <xref ref-type="bibr" rid="B13">Liu et al., 2018</xref>; <xref ref-type="bibr" rid="B24">Rosell and Fern&#xe1;ndez-Recio, 2018</xref>; <xref ref-type="bibr" rid="B25">Rosensweig et al., 2018</xref>), protein-protein docking (<xref ref-type="bibr" rid="B8">Kangueane and Nilofer, 2018</xref>; <xref ref-type="bibr" rid="B30">van Noort et al., 2021</xref>) and more (<xref ref-type="bibr" rid="B20">Poelwijk et al., 2016</xref>; <xref ref-type="bibr" rid="B23">Rivoire et al., 2016</xref>; <xref ref-type="bibr" rid="B26">Salinas and Ranganathan, 2018</xref>). 
Finally, note that for isoforms without good-quality templates, users could choose to use predicted structures such as those provided by AF and RosettaFold (<xref ref-type="bibr" rid="B2">Baek et al., 2021</xref>) or other modelling packages and/or protocols to build their own structural models using the isoform(s) correctly associated with the selected point mutations.</p>
<p>The test of MoNvIso on a set of proteins and the comparison with the results of AF confirm the validity of our approach. Additionally, our computational protocol can be easily inserted in a pipeline suitable to perform extensive campaigns of investigation on protein-protein interactions. MoNvIso is particularly useful for evaluating the availability of templates for large sets of proteins and for automatically selecting the isoform most suitable to be modelled that contains the point mutations of interest. MoNvIso is freely available and can be downloaded from GitHub at the following link: <ext-link ext-link-type="uri" xlink:href="https://github.com/MoNvIsoModeling/MoNvIso">https://github.com/MoNvIsoModeling/MoNvIso</ext-link>. It is implemented in Python 3.8, has been tested on versions 3.0, 3.7 and 3.9, and is supported on Linux.</p>
</sec>
<sec id="s7">
<title>Key points</title>
<p>
<list list-type="simple">
<list-item>
<p>1) We have developed a computational protocol to map mutations on appropriate isoforms of protein.</p>
</list-item>
<list-item>
<p>2) The protocol identifies the available templates on which mutations can be located.</p>
</list-item>
<list-item>
<p>3) The isoforms are ranked based on the number of located mutations and the template coverage.</p>
</list-item>
<list-item>
<p>4) Structural models are built for the WT and mutated isoforms if reliable templates are available.</p>
</list-item>
</list>
</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/MoNvIsoModeling/MoNvIso">https://github.com/MoNvIsoModeling/MoNvIso</ext-link>.</p>
</sec>
<sec id="s9">
<title>Author contributions</title>
<p>All authors provided contributions to study design, analysis and interpretation of data, drafting the article or revising it critically for important intellectual content. Here are the most important contributions of each author: PC, DA, OS, SR, and PR designed the study. FO, FM, AG, and PR developed the computational protocol. Data were collected by FO and PR. Analysis was carried out by FO, FM, AG, SR, PC, and PR.</p>
</sec>
<sec id="s10">
<title>Funding</title>
<p>SR received a Wilhelm Bessel Research Award from the Alexander von Humboldt Foundation. JA and OS received funding from the European Union&#x2019;s Horizon 2020 Framework Programme for Research and Innovation under the Specific Grant Agreement No. 945539 (Human Brain Project SGA3).</p>
</sec>
<ack>
<p>PC acknowledges the Deutsche Forschungsgemeinschaft (DFG) <italic>via</italic> the Research Training Group RTG2416 MultiSenses-MultiScales (368482240/GRK2416). We thank Emiliano Ippoliti (J&#xfc;lich), Enrico Gandini (Milan), and Andrea Bosin (Cagliari) for technical support.</p>
</ack>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The handling editor SP declared a past co-authorship with the author AG.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s13">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fchem.2022.1059593/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fchem.2022.1059593/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/docx" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Altschul</surname>
<given-names>S. F.</given-names>
</name>
<name>
<surname>Gish</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Myers</surname>
<given-names>E. W.</given-names>
</name>
<name>
<surname>Lipman</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>Basic local alignment search tool</article-title>. <source>J. Mol. Biol.</source> <volume>215</volume>, <fpage>403</fpage>&#x2013;<lpage>410</lpage>. <pub-id pub-id-type="doi">10.1016/s0022-2836(05)80360-2</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>DiMaio</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Anishchenko</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Dauparas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ovchinnikov</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G. R.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Accurate prediction of protein structures and interactions using a three-track neural network</article-title>. <source>Science</source> <volume>373</volume>, <fpage>871</fpage>&#x2013;<lpage>876</lpage>. <pub-id pub-id-type="doi">10.1126/science.abj8754</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bateman</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Orchard</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Magrane</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Agivetova</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ahmad</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>UniProt: The universal protein knowledgebase in 2021</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>D480</fpage>&#x2013;<lpage>D489</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa1100</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baum</surname>
<given-names>L. E.</given-names>
</name>
<name>
<surname>Petrie</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1966</year>). <article-title>Statistical inference for probabilistic functions of finite state Markov chains</article-title>. <source>Ann. Math. Stat.</source> <volume>37</volume>, <fpage>1554</fpage>&#x2013;<lpage>1563</lpage>. <pub-id pub-id-type="doi">10.1214/aoms/1177699147</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hollingsworth</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Dror</surname>
<given-names>R. O.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Molecular dynamics simulation for all</article-title>. <source>Neuron</source> <volume>99</volume>, <fpage>1129</fpage>&#x2013;<lpage>1143</lpage>. <pub-id pub-id-type="doi">10.1016/j.neuron.2018.08.011</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Howe</surname>
<given-names>K. L.</given-names>
</name>
<name>
<surname>Achuthan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Allen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Allen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Alvarez-Jarreta</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Amode</surname>
<given-names>M. R.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Ensembl 2021</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>D884</fpage>&#x2013;<lpage>D891</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa942</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Itoh</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>Okumura</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>All-Atom molecular dynamics simulation methods for the aggregation of protein and peptides: Replica exchange/permutation and nonequilibrium simulations</article-title>. <source>Methods Mol. Biol.</source> <volume>2340</volume>, <fpage>197</fpage>&#x2013;<lpage>220</lpage>. <pub-id pub-id-type="doi">10.1007/978-1-0716-1546-1_10</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kangueane</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Nilofer</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Protein-protein docking: Methods and tools</article-title>. <source>Protein-Protein Domain-Domain Interact.</source>, <fpage>161</fpage>&#x2013;<lpage>168</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kato</surname>
<given-names>G. J.</given-names>
</name>
<name>
<surname>Piel</surname>
<given-names>F. B.</given-names>
</name>
<name>
<surname>Reid</surname>
<given-names>C. D.</given-names>
</name>
<name>
<surname>Gaston</surname>
<given-names>M. H.</given-names>
</name>
<name>
<surname>Ohene-Frempong</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Krishnamurti</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Sickle cell disease</article-title>. <source>Nat. Rev. Dis. Prim.</source> <volume>4</volume>, <fpage>18010</fpage>. <pub-id pub-id-type="doi">10.1038/nrdp.2018.10</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kermani</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A guide to membrane protein X-ray crystallography</article-title>. <source>FEBS J.</source> <volume>288</volume>, <fpage>5788</fpage>&#x2013;<lpage>5804</lpage>. <pub-id pub-id-type="doi">10.1111/febs.15676</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kuhlman</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Bradley</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Advances in protein structure prediction and design</article-title>. <source>Nat. Rev. Mol. Cell. Biol.</source> <volume>20</volume>, <fpage>681</fpage>&#x2013;<lpage>697</lpage>. <pub-id pub-id-type="doi">10.1038/s41580-019-0163-x</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lazim</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Suh</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Advances in molecular dynamics simulations and enhanced sampling methods for the study of protein systems</article-title>. <source>Int. J. Mol. Sci.</source> <volume>21</volume>, <fpage>6339</fpage>. <pub-id pub-id-type="doi">10.3390/ijms21176339</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Machine learning approaches for protein&#x2013;protein interaction hot spot prediction: Progress and comparative assessment</article-title>. <source>Molecules</source> <volume>23</volume>, <fpage>2535</fpage>. <pub-id pub-id-type="doi">10.3390/molecules23102535</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Markwick</surname>
<given-names>P. R. L.</given-names>
</name>
<name>
<surname>Bouvignies</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Blackledge</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Exploring multiple timescale motions in protein GB3 using accelerated molecular dynamics and NMR spectroscopy</article-title>. <source>J. Am. Chem. Soc.</source> <volume>129</volume>, <fpage>4724</fpage>&#x2013;<lpage>4730</lpage>. <pub-id pub-id-type="doi">10.1021/ja0687668</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miller</surname>
<given-names>M. D.</given-names>
</name>
<name>
<surname>Phillips</surname>
<given-names>G. N.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Moving beyond static snapshots: Protein dynamics and the protein data bank</article-title>. <source>J. Biol. Chem.</source> <volume>296</volume>, <fpage>100749</fpage>. <pub-id pub-id-type="doi">10.1016/j.jbc.2021.100749</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Murakami</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Tripathi</surname>
<given-names>L. P.</given-names>
</name>
<name>
<surname>Prathipati</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mizuguchi</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Network analysis and <italic>in silico</italic> prediction of protein&#x2013;protein interactions with applications in drug discovery</article-title>. <source>Curr. Opin. Struct. Biol.</source> <volume>44</volume>, <fpage>134</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1016/j.sbi.2017.02.005</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Murata</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wolf</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Cryo-electron microscopy for structural analysis of dynamic biological macromolecules</article-title>. <source>Biochimica Biophysica Acta - General Subj.</source> <volume>1862</volume>, <fpage>324</fpage>&#x2013;<lpage>334</lpage>. <pub-id pub-id-type="doi">10.1016/j.bbagen.2017.07.020</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Papadopoulos</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Agarwala</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Cobalt: Constraint-based alignment tool for multiple protein sequences</article-title>. <source>Bioinformatics</source> <volume>23</volume>, <fpage>1073</fpage>&#x2013;<lpage>1079</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btm076</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Park</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The expanding landscape of alternative splicing variation in human populations</article-title>. <source>Am. J. Hum. Genet.</source> <volume>102</volume>, <fpage>11</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1016/j.ajhg.2017.11.002</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Poelwijk</surname>
<given-names>F. J.</given-names>
</name>
<name>
<surname>Krishna</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Ranganathan</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>The context-dependence of mutations: A linkage of formalisms</article-title>. <source>PLOS Comput. Biol.</source> <volume>12</volume>, <fpage>e1004771</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004771</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Raval</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Piana</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Eastwood</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Dror</surname>
<given-names>R. O.</given-names>
</name>
<name>
<surname>Shaw</surname>
<given-names>D. E.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Refinement of protein structure homology models via long, all-atom molecular dynamics simulations</article-title>. <source>Proteins.</source> <volume>80</volume>, <fpage>2071</fpage>&#x2013;<lpage>2079</lpage>. <pub-id pub-id-type="doi">10.1002/prot.24098</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rees</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Gladwin</surname>
<given-names>M. T.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Sickle-cell disease</article-title>. <source>Lancet</source> <volume>376</volume>, <fpage>2018</fpage>&#x2013;<lpage>2031</lpage>. <pub-id pub-id-type="doi">10.1016/s0140-6736(10)61029-x</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rivoire</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Reynolds</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Ranganathan</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Evolution-based functional decomposition of proteins</article-title>. <source>PLOS Comput. Biol.</source> <volume>12</volume>, <fpage>e1004817</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004817</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rosell</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fern&#xe1;ndez-Recio</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Hot-spot analysis for drug discovery targeting protein-protein interactions</article-title>. <source>Expert Opin. Drug Discov.</source> <volume>13</volume>, <fpage>327</fpage>&#x2013;<lpage>338</lpage>. <pub-id pub-id-type="doi">10.1080/17460441.2018.1430763</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rosensweig</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Reynolds</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Laothamatas</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Shan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ranganathan</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>An evolutionary hotspot defines functional differences between CRYPTOCHROMES</article-title>. <source>Nat. Commun.</source> <volume>9</volume>, <fpage>1138</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-018-03503-6</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Salinas</surname>
<given-names>V. H.</given-names>
</name>
<name>
<surname>Ranganathan</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Coevolution-based inference of amino acid interactions underlying protein function</article-title>. <source>Elife</source> <volume>7</volume>, <fpage>e34300</fpage>. <pub-id pub-id-type="doi">10.7554/elife.34300</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sali</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Statistical potential for assessment and prediction of protein structures</article-title>. <source>Protein Sci.</source> <volume>15</volume>, <fpage>2507</fpage>&#x2013;<lpage>2524</lpage>. <pub-id pub-id-type="doi">10.1110/ps.062416606</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Slabinski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Jaroszewski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Rodrigues</surname>
<given-names>A. P. C.</given-names>
</name>
<name>
<surname>Rychlewski</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>I. A.</given-names>
</name>
<name>
<surname>Lesley</surname>
<given-names>S. A.</given-names>
</name>
<etal/>
</person-group> (<year>2007</year>). <article-title>The challenge of protein structure determination&#x2014;lessons from structural genomics</article-title>. <source>Protein Sci.</source> <volume>16</volume>, <fpage>2472</fpage>&#x2013;<lpage>2482</lpage>. <pub-id pub-id-type="doi">10.1110/ps.073037907</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tunyasuvunakool</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Adler</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Green</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zielinski</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>&#x17D;&#xED;dek</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Highly accurate protein structure prediction for the human proteome</article-title>. <source>Nature</source> <volume>596</volume>, <fpage>590</fpage>&#x2013;<lpage>596</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-021-03828-1</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>van Noort</surname>
<given-names>C. W.</given-names>
</name>
<name>
<surname>Honorato</surname>
<given-names>R. V.</given-names>
</name>
<name>
<surname>Bonvin</surname>
<given-names>A. M. J. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Information-driven modeling of biomolecular complexes</article-title>. <source>Curr. Opin. Struct. Biol.</source> <volume>70</volume>, <fpage>70</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1016/j.sbi.2021.05.003</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Waterhouse</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bertoni</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bienert</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Studer</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Tauriello</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gumienny</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>SWISS-MODEL: Homology modelling of protein structures and complexes</article-title>. <source>Nucleic Acids Res.</source> <volume>46</volume>, <fpage>W296</fpage>&#x2013;<lpage>W303</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gky427</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Webb</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sali</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Comparative protein structure modeling using MODELLER</article-title>. <source>Curr. Protoc. Bioinforma.</source> <volume>54</volume>, <fpage>5.6.1</fpage>&#x2013;<lpage>5.6.37</lpage>. <pub-id pub-id-type="doi">10.1002/cpbi.3</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Poisson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>The I-TASSER suite: Protein structure and function prediction</article-title>. <source>Nat. Methods</source> <volume>12</volume>, <fpage>7</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1038/nmeth.3213</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>