<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Pharmacol.</journal-id>
<journal-title>Frontiers in Pharmacology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Pharmacol.</abbrev-journal-title>
<issn pub-type="epub">1663-9812</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">837715</article-id>
<article-id pub-id-type="doi">10.3389/fphar.2022.837715</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Pharmacology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Pocket2Drug: An Encoder-Decoder Deep Neural Network for the Target-Based Drug Design</article-title>
<alt-title alt-title-type="left-running-head">Shi et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Drug Design with Deep Learning</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Shi</surname>
<given-names>Wentao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1603515/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Singha</surname>
<given-names>Manali</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Srivastava</surname>
<given-names>Gopal</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pu</surname>
<given-names>Limeng</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/515188/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ramanujam</surname>
<given-names>J.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Brylinski</surname>
<given-names>Michal</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/29325/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Division of Electrical and Computer Engineering</institution>, <institution>Louisiana State University</institution>, <addr-line>Baton Rouge</addr-line>, <addr-line>LA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Biological Sciences</institution>, <institution>Louisiana State University</institution>, <addr-line>Baton Rouge</addr-line>, <addr-line>LA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Center for Computation and Technology</institution>, <institution>Louisiana State University</institution>, <addr-line>Baton Rouge</addr-line>, <addr-line>LA</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/414005/overview">Adriano D. Andricopulo</ext-link>, University of Sao Paulo, Brazil</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1005782/overview">Jahan B. Ghasemi</ext-link>, University of Tehran,&#x20;Iran</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/314366/overview">Guang Hu</ext-link>, Soochow University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Michal Brylinski, <email>michal@brylinski.org</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Experimental Pharmacology and Drug Discovery, a section of the journal Frontiers in Pharmacology</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>11</day>
<month>03</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>837715</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>12</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>10</day>
<month>02</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Shi, Singha, Srivastava, Pu, Ramanujam and Brylinski.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Shi, Singha, Srivastava, Pu, Ramanujam and Brylinski</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Computational modeling is an essential component of modern drug discovery. One of its most important applications is to select promising drug candidates for pharmacologically relevant target proteins. Because of continuing advances in structural biology, putative binding sites for small organic molecules are being discovered in numerous proteins linked to various diseases. These valuable data offer new opportunities to build efficient computational models predicting binding molecules for target sites through the application of data mining and machine learning. In particular, deep neural networks are powerful techniques capable of learning from complex data in order to make informed drug binding predictions. In this communication, we describe Pocket2Drug, a deep graph neural network model to predict binding molecules for a given ligand binding site. This approach first learns the conditional probability distribution of small molecules from a large dataset of pocket structures with supervised training, followed by the sampling of drug candidates from the trained model. Comprehensive benchmarking simulations show that using Pocket2Drug significantly improves the chances of finding molecules binding to target pockets compared to traditional drug selection procedures. Specifically, known binders are generated for as many as 80.5% of targets present in the testing set consisting of dissimilar data from that used to train the deep graph neural network model. Overall, Pocket2Drug is a promising computational approach to inform the discovery of novel biopharmaceuticals.</p>
</abstract>
<kwd-group>
<kwd>ligand binding sites</kwd>
<kwd>drug discovery and development</kwd>
<kwd>in silico drug design</kwd>
<kwd>deep learning</kwd>
<kwd>graph neural network</kwd>
<kwd>recurrent neural network</kwd>
<kwd>generative model</kwd>
<kwd>machine learning</kwd>
</kwd-group>
<contract-num rid="cn001">R35GM119524</contract-num>
<contract-num rid="cn002">CCF1619303</contract-num>
<contract-num rid="cn003">LEQSF (2016-19)-RD-B03</contract-num>
<contract-sponsor id="cn001">National Institute of General Medical Sciences<named-content content-type="fundref-id">10.13039/100000057</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">National Science Foundation<named-content content-type="fundref-id">10.13039/100000001</named-content>
</contract-sponsor>
<contract-sponsor id="cn003">Louisiana Board of Regents<named-content content-type="fundref-id">10.13039/100006952</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Recent developments in genomics revealed novel disease-related molecular targets, many of which are yet to be characterized with respect to the possibility of modulating their functions with pharmaceutical agents. Another challenge in pharmacotherapy arises from resistance effects to existing drugs complicating the treatment, particularly of infectious diseases (<xref ref-type="bibr" rid="B48">Trebosc et&#x20;al., 2019</xref>) and cancer (<xref ref-type="bibr" rid="B46">Shou et&#x20;al., 2004</xref>). Therefore, many drug development projects are focused on the discovery of small molecule therapeutics with new modes of action (<xref ref-type="bibr" rid="B16">Gerry and Schreiber, 2018</xref>). Generating novel small molecules is a difficult endeavor due to the high complexity of biological systems and the enormous size of chemical space of organic compounds. Traditional experimental techniques can be used to identify drug-like molecules performing specific biochemical tasks by binding to macromolecular targets with a high specificity in order to modulate their cellular functions. Nonetheless, even advanced high-throughput screening methods have notable limitations due to the long time and high costs of screening a large number of drug candidates.</p>
<p>To make the drug discovery process more efficient, modern approaches incorporate miscellaneous computational components. Virtual screening (VS) is perhaps the most widely used strategy to help identify potentially bioactive molecules from large collections of commercially available as well as virtual compounds (<xref ref-type="bibr" rid="B43">Segler et&#x20;al., 2018</xref>). Despite its utility, this technology has certain drawbacks such as high false-positive rates, the requirement of predefined ligand libraries for structure-based VS, oversimplified scoring functions, and protein structure frameworks absent in ligand-based VS (<xref ref-type="bibr" rid="B53">Wu et&#x20;al., 2019</xref>). More recently, machine learning (ML) methods addressing many of these issues have become available for drug discovery. New ML techniques include a quantitative structure-activity relationship model to predict the target affinity, toxicity, and side effects (<xref ref-type="bibr" rid="B36">Mouchlis et&#x20;al., 2021</xref>) and an approach to model polypharmacy side effects with graph convolutional networks (GCN) (<xref ref-type="bibr" rid="B63">Zitnik et&#x20;al., 2018</xref>).</p>
<p>Deep learning (DL) is a family of modern machine learning models utilizing deep neural networks (DNNs). DL models have been demonstrated to be powerful feature extractors for ligand binding site classifiers (<xref ref-type="bibr" rid="B25">Jim&#xe9;nez et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B42">Pu et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B44">Shi et&#x20;al., 2020</xref>) and metric learning models for binding sites in proteins (<xref ref-type="bibr" rid="B47">Simonovsky and Meyers, 2020</xref>). Recurrent neural networks (RNNs) are iterative DL models that generate sequences through multiple iterations. In each iteration, the RNN model generates an output of time <inline-formula id="inf1">
<mml:math id="m1">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula> taking the output of iteration <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> as the input. According to the probabilistic language model (<xref ref-type="bibr" rid="B18">Graves, 2013</xref>), the probability of input token <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is modeled as <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is the probability of <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> conditioned on the output token <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from the previous iteration. This powerful methodology was applied to <italic>de novo</italic> drug discovery, where RNNs were trained to model the probability distribution of a drug dataset (<xref ref-type="bibr" rid="B13">Ertl et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B43">Segler et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B19">Gupta et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B58">Yasonik, 2020</xref>). These methods treat a drug dataset as a set of languages and employ an RNN to learn the corresponding language models. After the training stage is completed, the RNN learns the probability distribution <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of the drug dataset, from which molecules can be sampled. RNN-based approaches often represent molecules using a simplified molecular-input line-entry system (SMILES) (<xref ref-type="bibr" rid="B52">Weininger, 1988</xref>), where individual string characters represent tokens of time steps. Although using RNNs to learn the distributions of drug datasets offers new opportunities to find drugs, these techniques still employ a random search of the chemical space leading to long virtual screening times. From a computational standpoint, when the aim is to identify promising lead molecules against a target binding site, it is certainly advantageous to have the search space significantly reduced.</p>
<p>In order to achieve this goal, we developed Pocket2Drug, a new deep generative model with the encoder-decoder architecture. Inspired by the framework of image captioning models taking images as the input to generate corresponding captions (<xref ref-type="bibr" rid="B50">Vinyals et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B55">Xu et&#x20;al., 2015</xref>), the basic idea is to provide RNN with the prior information on ligand binding pockets to improve the chances of finding bioactive molecules. A typical image captioning model consists of two parts, an encoder/feature extractor and a decoder. A convolutional neural network (CNN) is often used as the encoder extracting fixed-size latent feature vectors from the input images containing the prior information that can subsequently be decoded by an RNN to generate image captions. Formally, image captioning models learn the probability of sequences conditioned on prior information, i.e.,&#x20;<inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>m</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>Pocket2Drug has a similar encoder-decoder architecture consisting of an encoder to extract features and a decoder to generate molecules. Nonetheless, Pocket2Drug differs from typical image captioning models in that it employs a graph representation of drug binding sites instead of images. Consequently, a graph neural network (GNN) is employed as the encoder to extract the prior information from input pockets followed by an RNN decoder to generate molecule strings, which are the equivalents of image captions. In comprehensive benchmarking simulations against ligand-bound, ligand-free, and low-homology datasets of binding sites, we show that Pocket2Drug employing the encoder-decoder DNN effectively predicts binding drugs for input pocket structures.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<sec id="s2-1">
<title>Datasets</title>
<p>Datasets used in this study were compiled from a non-redundant library of 51,677 pockets with bound ligands constructed for binding site prediction with <italic>e</italic>FindSite (<xref ref-type="bibr" rid="B7">Brylinski and Feinstein, 2013</xref>). The redundancy in the original library was already removed by excluding proteins with the template modeling (TM)-score, measuring the structure similarity (<xref ref-type="bibr" rid="B61">Zhang and Skolnick, 2004</xref>), of &#x2265;0.4 and the 3D Tanimoto coefficient (TC), measuring the ligand similarity (<xref ref-type="bibr" rid="B27">Kawabata, 2011</xref>), of &#x2265;0.7. We further filtered the dataset based on the synthetic accessibility (SA) score (<xref ref-type="bibr" rid="B12">Ertl and Schuffenhauer, 2009</xref>) removing low- and high-complexity compounds whose SA scores are &#x2264;1 and &#x2265;6, respectively. This procedure resulted in a high-quality dataset of 48,365 pockets binding small organic compounds, which were randomly split into training (90%) and testing (10%) subsets. The training subset of 43,529 pockets is referred to as the Pocket2Drug-train dataset while the remaining 4,836 (testing) pockets are called the Pocket2Drug-holo dataset.</p>
<p>Next, 433 pockets having a protein sequence identity of &#x2264;0.5 with pockets in the training subset were selected from the Pocket2Drug-holo dataset creating the Pocket2Drug-lowhomol dataset to evaluate the ability to generalize to unseen data. Finally, the basic local alignment search tool (BLAST) (<xref ref-type="bibr" rid="B3">Altschul et&#x20;al., 1990</xref>) was used with a sequence identity threshold of 95% to identify the apo structures of Pocket2Drug-holo proteins in the Protein Data Bank (PDB) (<xref ref-type="bibr" rid="B6">Berman et&#x20;al., 2002</xref>). Ligand-free structures were then aligned on the corresponding holo-proteins with TM-align (<xref ref-type="bibr" rid="B62">Zhang and Skolnick, 2005</xref>) and those producing significant alignments with a TM-score of &#x2265;0.5 (<xref ref-type="bibr" rid="B54">Xu and Zhang, 2010</xref>) were retained. This procedure resulted in 828&#x20;ligand-free pockets referred to as the Pocket2Drug-apo dataset.</p>
</sec>
<sec id="s2-2">
<title>Graph Representation of Pockets</title>
<p>Binding pockets are represented as graphs, in which nodes are non-hydrogen atoms and edges connect pairs of atoms spatially located within 4.5&#xa0;&#xc5; from one another (<xref ref-type="bibr" rid="B45">Shi et&#x20;al., 2021</xref>). Node features include the hydrophobicity (<xref ref-type="bibr" rid="B33">Mahn et&#x20;al., 2009</xref>), the charge, the binding probability (<xref ref-type="bibr" rid="B24">Jian et&#x20;al., 2016</xref>), the solvent accessible surface area (<xref ref-type="bibr" rid="B2">Ali et&#x20;al., 2014</xref>), and the sequence entropy (<xref ref-type="bibr" rid="B32">Liao et&#x20;al., 2005</xref>), whereas the edge attribute is the bond multiplicity for covalently bonded atoms and 0 for atoms interacting non-covalently. Pockets are centered at the origin with principal axes aligned to Cartesian axes. The coordinates of individual atoms are also used as node features in order to provide the additional 3D information on binding pockets. This graph representation of ligand binding sites was used to accurately classify pockets in protein structures with GraphSite (<xref ref-type="bibr" rid="B45">Shi et&#x20;al., 2021</xref>).</p>
</sec>
<sec id="s2-3">
<title>Encoder-Decoder Architecture</title>
<p>Pocket2Drug is implemented in PyTorch v1.7.1 (<xref ref-type="bibr" rid="B41">Paszke et&#x20;al., 2019</xref>) and employs a DNN with the encoder-decoder architecture. The model learns the probability distribution of molecules conditioned on ligand binding pockets, <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is then used to sample molecules for a given pocket as the prior condition. The pipeline implemented in Pocket2Drug is illustrated in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. For the input binding site (<xref ref-type="fig" rid="F1">Figure&#x20;1A</xref>), a graph representation is generated by GraphSite (<xref ref-type="bibr" rid="B45">Shi et&#x20;al., 2021</xref>) (<xref ref-type="fig" rid="F1">Figure&#x20;1B</xref>) and the resulting graph is processed by an encoder to generate a fixed-size graph embedding (<xref ref-type="fig" rid="F1">Figure&#x20;1C</xref>). As the encoder, we use a GNN constructed by removing the fully connected layers of the GraphSite classifier with parameters pretrained on binding site classification tasks (<xref ref-type="bibr" rid="B45">Shi et&#x20;al., 2021</xref>). Subsequently, an RNN decoder takes the generated embedding vector as the input to compute SMILES sequences representing binding drugs (<xref ref-type="fig" rid="F1">Figure&#x20;1D</xref>). Pocket2Drug is trained in an end-to-end fashion meaning that the parameters of both encoder and decoder are updated during backpropagation.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Flowchart of Pocket2Drug. The input ligand-binding pocket <bold>(A)</bold> is first represented as a graph <bold>(B)</bold> and then used by the encoder graph neural network to generate a fixed-size graph embedding <bold>(C)</bold>. The decoder recurrent neural network generates molecule strings <bold>(D)</bold> from the graph embedding.</p>
</caption>
<graphic xlink:href="fphar-13-837715-g001.tif"/>
</fig>
</sec>
<sec id="s2-4">
<title>Graph Neural Network Encoder</title>
<p>The GNN encoder extracts latent features from the input pocket graphs. We use the embedding network implemented in the GraphSite classifier, with the last fully connected layer removed, as the feature extractor. The message passing function utilizes weighted neighbor node features, in which weights are generated by a two-layer, fully connected neural network taking edge features as the input. Updated node features in the <inline-formula id="inf10">
<mml:math id="m10">
<mml:mi>k</mml:mi>
</mml:math>
</inline-formula>-th layer of node <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, defined as<disp-formula id="e1">
<mml:math id="m12">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>k</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b5;</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:munder>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>j</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>are first computed as a weighted sum of the first-order neighbors. The features of <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are weighted by <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="italic">&#x3b5;</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="italic">&#x3b5;</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a trainable parameter. The weights of the first-order neighbors are generated by a neural network <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>&#x3c9;</mml:mi>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> taking the edge feature, <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, as the input. Then, multiple channels of the weighted sum of the node features are concatenated and updated by another neural network <inline-formula id="inf17">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>&#x3b8;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Next, the output of each layer is connected by the jumping knowledge (JK)-network (<xref ref-type="bibr" rid="B56">Xu et&#x20;al., 2018</xref>). The JK-network enables an automatic selection of the number of layers for individual nodes. Finally, the initial node embeddings are processed by the Set2Set graph read-out layer (<xref ref-type="bibr" rid="B49">Vinyals et&#x20;al., 2016</xref>) to construct final, fixed-size graph embeddings.</p>
</sec>
<sec id="s2-5">
<title>Recurrent Neural Network Decoder</title>
<p>As a decoder, we use the gated recurrent unit (GRU), which is a variation of the vanilla RNN (<xref ref-type="bibr" rid="B10">Cho et&#x20;al., 2014</xref>). The decoder network models a conditional probability of the output sequence based on the prior information on a ligand binding pocket:<disp-formula id="e2">
<mml:math id="m19">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x220f;</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
<mml:mi>P</mml:mi>
</mml:mstyle>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the token of a molecule string at iteration <inline-formula id="inf19">
<mml:math id="m21">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula>, and <inline-formula id="inf20">
<mml:math id="m22">
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula> is the length of the output string. Note that <inline-formula id="inf21">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the &#x201c;end of string&#x201d;, or <inline-formula id="inf22">
<mml:math id="m24">
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, token. <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> shows that the GRU network works differently during training and inference stages. During training, the graph embedding is taken by the GRU as the prior information to model the probability distribution of all tokens, where the probability of a token <inline-formula id="inf23">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is <inline-formula id="inf24">
<mml:math id="m26">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. In the remaining iterations, input tokens <inline-formula id="inf25">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the binding drug string are mapped to vectors by the embedding layer and passed to the GRU as the input. The GRU then predicts the next token by generating another probability distribution <inline-formula id="inf26">
<mml:math id="m28">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The negative log likelihood of the binding drug is used as the loss function:<disp-formula id="e3">
<mml:math id="m29">
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
<mml:mrow>
<mml:mi>log</mml:mi>
</mml:mrow>
</mml:mstyle>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Architecture of the recurrent neural network decoder. The decoder employs multiple gated recurrent units (GRUs). During model training, the molecule strings of binding drugs are used as the input. Dashed arrows represent the inference stage, in which the token sampled from <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is used as the input at iteration <inline-formula id="inf28">
<mml:math id="m31">
<mml:mi>t</mml:mi>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic xlink:href="fphar-13-837715-g002.tif"/>
</fig>
<p>Dashed arrows in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> represent the inference stage. Here, the first iteration is the same as during training, i.e.,&#x20;the encoder generates graph embeddings used as the input in the first iteration. However, in the subsequent iterations, the RNN model takes the token <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, sampled from the distribution of the previous step, to generate the distribution <inline-formula id="inf30">
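<mml:math id="m33">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>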
</inline-formula>. The inference stops when the <inline-formula id="inf31">
<mml:math id="m34">
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> token is reached.</p>
</sec>
<sec id="s2-6">
<title>Tokenization Scheme</title>
<p>Molecules can be represented by strings encoded by different tokenization schemes. Although SMILES is a widely used molecular string system, it was not designed for ML applications. Because of a strict syntax of SMILES, a significant portion of molecules generated by machine learning models are invalid. In addition, parentheses and ring indicators may be separated by long distances in SMILES strings causing problems for RNNs that have difficulty learning long-term dependencies (<xref ref-type="bibr" rid="B40">&#xd6;zt&#xfc;rk et&#x20;al., 2020</xref>). This issue can be addressed by improving either the RNN model or the tokenization scheme. For instance, RNN variants implementing &#x201c;shortcuts&#x201d; were developed to model long-term dependencies (<xref ref-type="bibr" rid="B21">Hochreiter and Schmidhuber, 1997</xref>). A long short-term memory (LSTM) model can also be used instead of a vanilla RNN in <italic>de novo</italic> drug design applications to learn the distribution of a drug dataset (<xref ref-type="bibr" rid="B13">Ertl et&#x20;al., 2017</xref>). Another workaround is to improve the tokenization scheme to make the string representation of molecules more suitable for ML applications. An example is DeepSMILES developed to enhance DL-based models taking SMILES as the input (<xref ref-type="bibr" rid="B39">O&#x2019;Boyle and Dalke, 2018</xref>).</p>
<p>Pocket2Drug employs SELF-referencing Embedding Strings (SELFIES), another molecule tokenization scheme designed for machine learning applications (<xref ref-type="bibr" rid="B29">Krenn et&#x20;al., 2020</xref>). The SELFIES method was selected because of several important properties. Not only can any molecule be represented by a SELFIES string, but all virtual molecules generated by an ML model are also valid. Importantly, the information on rings and branches in SELFIES is localized by storing the branch size and ring size together with their identifiers. This tokenization scheme makes it easier for RNNs to learn from the &#x201c;past&#x201d; information compared to, e.g., SMILES, which requires RNNs to infer ring/branch indicators based on non-localized information.</p>
</sec>
</sec>
<sec id="s3">
<title>Evaluation and Results</title>
<p>Pocket2Drug was trained on the Pocket2Drug-train dataset and validated against Pocket2Drug-holo, -apo, and -lowhomol datasets. We first analyze the size of molecules generated for the Pocket2Drug-holo dataset. <xref ref-type="fig" rid="F3">Figure&#x20;3</xref> shows that there is a notable correlation between the size of pockets and the size of binding molecules, referred to as label ligands, across experimental complex structures (blue bars). Encouragingly, the size of ligands constructed by Pocket2Drug is also correlated with the pocket size, although these molecules tend to be somewhat smaller than the corresponding label ligands (green bars). This result can be attributed to the fact that capturing longer dependencies in molecular strings is more difficult for the RNN trained to minimize the cross-entropy loss summed over all tokens. In other words, the model makes fewer mistakes by generating smaller molecules.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Relationship between the ligand size and the size of binding pockets. The size of ligands and pockets is quantified by the number of non-hydrogen atoms. Binding pockets are assigned to four size groups: &#x3c;100, 100&#x2013;160, 161&#x2013;220, and &#x3e;220 atoms. For each pocket group, quartiles and the interquartile range are calculated for the size of label ligands (blue bars) and those molecules generated by Pocket2Drug (green bars).</p>
</caption>
<graphic xlink:href="fphar-13-837715-g003.tif"/>
</fig>
<p>Next, the quality of molecules generated for the Pocket2Drug-holo dataset is evaluated using two complementing protocols, one based on the chemical similarity of binding molecules (<xref ref-type="bibr" rid="B4">Baldi and Nasr, 2010</xref>) and another utilizing the structure alignments of protein pockets (<xref ref-type="bibr" rid="B59">Yeturu and Chandra, 2011</xref>). Pocket2Drug is compared to two baselines. The first method randomly selects drug candidates from the ZINC database, a curated collection of commercially available chemical compounds prepared specifically for virtual screening (<xref ref-type="bibr" rid="B23">Irwin and Shoichet, 2005</xref>). The second baseline method selects drug candidates from the output of a vanilla RNN (<xref ref-type="bibr" rid="B43">Segler et&#x20;al., 2018</xref>) representing a typical DL-based approach for <italic>de novo</italic> drug design.</p>
<sec id="s3-1">
<title>Evaluation by Ligand Chemical Similarity</title>
<p>The performance of Pocket2Drug, ZINC, and vanilla RNN is evaluated with the TC between the generated molecules and label ligands. For each pocket in the Pocket2Drug-holo dataset, TC values are calculated for a specified number of molecules sampled from the model output and the highest TC is selected as the final score. <xref ref-type="table" rid="T1">Table&#x20;1</xref> reports the percentage of Pocket2Drug-holo pockets with the corresponding score greater than or equal to a TC threshold ranging from 0.7 to 1.0. Encouragingly, using Pocket2Drug significantly improves the chances of finding binding molecules compared to ZINC and vanilla RNN. For a sample size of 20,480 (10 batches of 2,048 molecules each to maximize the GPU utilization), Pocket2Drug generates at least one molecule with a TC of &#x2265;0.7 to the label ligand for as many as 95.9% of pockets. Note that two molecules sharing chemical similarity with a TC of &#x2265;0.7 tend to have a similar bioactivity (<xref ref-type="bibr" rid="B30">Kumar, 2011</xref>; <xref ref-type="bibr" rid="B5">Ben Lo, 2016</xref>). For the majority of pockets (52.5%), Pocket2Drug selects the label ligand itself (a TC of 1.0). This performance is significantly higher than that of ZINC/vanilla RNN, which select ligands with a TC of &#x2265;0.7 for 58.9%/57.1% of pockets and label ligands for merely 0.4%/0.1% of pockets. Increasing the sample size to 81,920 slightly improves the performance because four times more molecules are used to select that with the highest TC value. A significantly improved performance of Pocket2Drug over vanilla RNN can be attributed to the effective utilization of the prior information on ligand binding pockets learned by the ML&#x20;model.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Hit rates for the Pocket2Drug-holo dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Method</th>
<th colspan="4" align="center">Sample size of 20,480</th>
<th colspan="4" align="center">Sample size of 81,920</th>
</tr>
<tr>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Pocket2Drug</td>
<td align="char" char=".">95.9</td>
<td align="char" char=".">79.9</td>
<td align="char" char=".">64.8</td>
<td align="char" char=".">52.5</td>
<td align="char" char=".">98.4</td>
<td align="char" char=".">86.8</td>
<td align="char" char=".">69.7</td>
<td align="char" char=".">56.4</td>
</tr>
<tr>
<td align="left">ZINC</td>
<td align="char" char=".">58.9</td>
<td align="char" char=".">23.8</td>
<td align="char" char=".">3.3</td>
<td align="char" char=".">0.4</td>
<td align="char" char=".">73.6</td>
<td align="char" char=".">40.5</td>
<td align="char" char=".">8.4</td>
<td align="char" char=".">1.2</td>
</tr>
<tr>
<td align="left">Vanilla RNN</td>
<td align="char" char=".">57.1</td>
<td align="char" char=".">19.7</td>
<td align="char" char=".">1.6</td>
<td align="char" char=".">0.1</td>
<td align="char" char=".">70.9</td>
<td align="char" char=".">35.3</td>
<td align="char" char=".">4.7</td>
<td align="char" char=".">0.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Next, the performance of Pocket2Drug is assessed against the Pocket2Drug-apo dataset. The mean root-mean-square deviation (RMSD) (<xref ref-type="bibr" rid="B26">Kabsch, 1976</xref>) of ligand-free structures against ligand-bound conformations is 1.2&#x20;&#xb1;&#x20;0.9&#xa0;&#xc5;. This low RMSD is expected because, with a few exceptions, the structures of apo- and holo-proteins tend to be highly similar (<xref ref-type="bibr" rid="B8">Brylinski and Skolnick, 2008</xref>). <xref ref-type="table" rid="T2">Table&#x20;2</xref> reports hit rates for molecules generated by Pocket2Drug using ligand-free and the corresponding ligand-bound pockets in the Pocket2Drug-holo dataset. Encouragingly, the performance of Pocket2Drug is independent of the ligand binding state of target proteins; therefore, the model does not require input proteins to be co-crystallized with ligands in order to successfully generate binding molecules.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Hit rates for the Pocket2Drug-apo dataset. For each ligand-free structure, the corresponding ligand-bound pocket is selected from the Pocket2Drug-holo dataset for the apples-to-apples comparison.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Binding state</th>
<th colspan="4" align="center">Sample size of 20,480</th>
<th colspan="4" align="center">Sample size of 81,920</th>
</tr>
<tr>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Ligand-free</td>
<td align="char" char=".">95.3</td>
<td align="char" char=".">72.7</td>
<td align="char" char=".">53.3</td>
<td align="char" char=".">37.4</td>
<td align="char" char=".">98.2</td>
<td align="char" char=".">82.2</td>
<td align="char" char=".">57.2</td>
<td align="char" char=".">40.5</td>
</tr>
<tr>
<td align="left">Ligand-bound</td>
<td align="char" char=".">95.3</td>
<td align="char" char=".">72.2</td>
<td align="char" char=".">52.3</td>
<td align="char" char=".">37.0</td>
<td align="char" char=".">98.2</td>
<td align="char" char=".">81.6</td>
<td align="char" char=".">58.1</td>
<td align="char" char=".">41.2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We also evaluate the ability of Pocket2Drug to generalize to unseen data by measuring its performance against the Pocket2Drug-lowhomol dataset. As reported in <xref ref-type="table" rid="T3">Table&#x20;3</xref>, label ligands (a TC of 1.0) are generated by Pocket2Drug in 77.1%/80.5% of the cases when the sample size is 20,480/81,920. This performance represents a notable improvement over ZINC and vanilla RNN, which select very few label ligands. Pocket2Drug also achieves the highest performance for other TC thresholds ranging from 0.7 to 0.9. These results show that Pocket2Drug not only performs exceptionally well against Pocket2Drug-holo and -apo datasets, but also against the Pocket2Drug-lowhomol dataset comprising proteins with a low sequence homology to the training subset, demonstrating that it generalizes well to unseen&#x20;data.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Hit rates for the Pocket2Drug-lowhomol dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Method</th>
<th colspan="4" align="center">Sample size of 20,480</th>
<th colspan="4" align="center">Sample size of 81,920</th>
</tr>
<tr>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.7</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.8</italic> (%)</th>
<th align="center">
<italic>TC &#x2265; 0.9</italic> (%)</th>
<th align="center">
<italic>TC &#x3d;1.0</italic> (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Pocket2Drug</td>
<td align="char" char=".">98.2</td>
<td align="char" char=".">95.2</td>
<td align="char" char=".">87.5</td>
<td align="char" char=".">77.1</td>
<td align="char" char=".">98.9</td>
<td align="char" char=".">96.8</td>
<td align="char" char=".">90.0</td>
<td align="char" char=".">80.5</td>
</tr>
<tr>
<td align="left">ZINC</td>
<td align="char" char=".">49.2</td>
<td align="char" char=".">18.4</td>
<td align="char" char=".">2.7</td>
<td align="char" char=".">0.2</td>
<td align="char" char=".">66.7</td>
<td align="char" char=".">36.3</td>
<td align="char" char=".">10.4</td>
<td align="char" char=".">2.3</td>
</tr>
<tr>
<td align="left">Vanilla RNN</td>
<td align="char" char=".">50.8</td>
<td align="char" char=".">16.1</td>
<td align="char" char=".">0.9</td>
<td align="char" char=".">0.0</td>
<td align="char" char=".">62.8</td>
<td align="char" char=".">28.8</td>
<td align="char" char=".">5.7</td>
<td align="char" char=".">0.9</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Two representative examples of pockets in the Pocket2Drug-lowhomol dataset are discussed in detail, a nucleotide binding site in the human mitogen and stress activated protein kinase 1 (MSK1) and a sugar binding site in <sc>d</sc>-allose binding protein (ALBP) from <italic>E. coli</italic>. MSK1 is involved in the regulation of mitogen activated kinases and it is required by the tumor-promoter-induced neoplastic cell transformation (<xref ref-type="bibr" rid="B34">Malakhova et&#x20;al., 2010</xref>). The complex structure of MSK1 and the phospho-amino-phosphonic acid-adenylate ester (AMP-PNP) (<xref ref-type="bibr" rid="B34">Malakhova et&#x20;al., 2010</xref>) was chosen as the target. AMP-PNP is a competitive ATPase inhibitor blocking the ATP-dependent oxidative phosphorylation (<xref ref-type="bibr" rid="B31">Lardy et&#x20;al., 1975</xref>). <xref ref-type="fig" rid="F4">Figure&#x20;4A</xref> shows the distribution of TC similarities between the label ligand, AMP-PNP, and molecules generated by Pocket2Drug and two baseline methods. Although most virtual molecules have relatively low TC similarities to AMP-PNP, more molecules with high TC values are sampled from the Pocket2Drug model compared to ZINC and vanilla RNN. According to the Fisher-Pitman permutation test (<xref ref-type="bibr" rid="B38">Neuh&#xe4;user and Manly, 2004</xref>), the difference between Pocket2Drug and vanilla RNN is statistically significant with a <italic>p</italic>-value close to 0 and that between Pocket2Drug and ZINC is insignificant with a <italic>p</italic>-value of&#x20;0.1.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Chemical similarity of molecules generated by Pocket2Drug to label ligands. Label ligands are molecules bound to target pockets in experimental complex structures, <bold>(A)</bold> AMP-PNP binding to MSK1 and <bold>(B)</bold> <italic>&#x3b2;</italic>-<sc>d</sc>-allose binding to ALBP. Chemical similarity is measured with the Tanimoto coefficient (TC).</p>
</caption>
<graphic xlink:href="fphar-13-837715-g004.tif"/>
</fig>
<p>To better understand the biological relevance of molecules generated by Pocket2Drug, five representative compounds with TC similarities against AMP-PNP ranging from 1.0 to 0.8 are presented in <xref ref-type="fig" rid="F5">Figure&#x20;5</xref>. <xref ref-type="fig" rid="F5">Figure&#x20;5A</xref> shows AMP-PNP, which is a nonhydrolyzable ATP analogue forming hydrogen bonds with MSK1 pocket residues through several moieties, NH<sub>2</sub> in adenine, 3&#x2032; OH in pentose sugar, OH in <italic>&#x3b2;</italic>-phosphate, NH linking <italic>&#x3b2;</italic>- and &#x3b3;-phosphates and OH in &#x3b3;-phosphate in the complex crystal structure (<xref ref-type="bibr" rid="B34">Malakhova et&#x20;al., 2010</xref>). Interestingly, several molecules generated by Pocket2Drug have common substructures with either substitutions in the adenine moiety (<xref ref-type="fig" rid="F5">Figures 5E,F</xref>) and the terminal phosphate group (<xref ref-type="fig" rid="F5">Figure&#x20;5B</xref>) or sharing the PNP subunit (<xref ref-type="fig" rid="F5">Figures 5C,D</xref>). These virtual molecules contain groups forming important hydrogen bonds with MSK1 pocket residues. To further evaluate the possibility of binding, all molecules were docked into the AMP-PNP pocket of MSK1 with fkcombu (<xref ref-type="bibr" rid="B28">Kawabata and Nakamura, 2014</xref>). The docking scores of the generated molecules are 12.5, 18.1, 21.8, 17.6, and 13.0 (<xref ref-type="fig" rid="F5">Figures 5B&#x2013;F</xref>, respectively). These results indicate that molecules generated by Pocket2Drug dock favorably to the target pocket with the compound shown in <xref ref-type="fig" rid="F5">Figures 5B,G</xref> having the best docking score due to the substitution in the <italic>&#x3b2;</italic>-phosphate&#x20;group.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Examples of molecules generated by Pocket2Drug for a binding site in MSK1. <bold>(A)</bold> The label ligand, AMP-PNP. <bold>(B&#x2013;F)</bold> Molecules constructed by Pocket2Drug with maximum common substructures to the label ligand highlighted in cyan. <bold>(G)</bold> Molecule shown in <bold>B</bold> (ice blue) docked to the binding site in MSK1 (orange).</p>
</caption>
<graphic xlink:href="fphar-13-837715-g005.tif"/>
</fig>
<p>The improvement of Pocket2Drug over baseline methods is even more perceptible for ALBP where the distribution of TC similarities to the label ligand is shifted toward much higher values for molecules sampled from the Pocket2Drug model (<xref ref-type="fig" rid="F4">Figure&#x20;4B</xref>). Differences between Pocket2Drug and both baseline methods are statistically significant with <italic>p</italic>-values close to 0. ALBP is a member of the ATP-binding cassette (ABC) transporter family facilitating the import and export of various molecules across the cell membrane (<xref ref-type="bibr" rid="B14">Fath and Kolter, 1993</xref>). ALBP binds <italic>&#x3b2;</italic>-<sc>d</sc>-allose, shown in <xref ref-type="fig" rid="F6">Figure&#x20;6A</xref>, with a <italic>K</italic><sub>d</sub> of 0.33&#xa0;&#x3bc;<sc>m</sc> (<xref ref-type="bibr" rid="B9">Chaudhuri et&#x20;al., 1999</xref>). In the crystal complex structure, <italic>&#x3b2;</italic>-<sc>d</sc>-allose forms multiple interactions with the pocket residues of ALBP through the ring oxygen and five hydroxyl moieties (<xref ref-type="bibr" rid="B9">Chaudhuri et&#x20;al., 1999</xref>). Selected compounds generated by Pocket2Drug are presented in <xref ref-type="fig" rid="F6">Figures 6B&#x2013;F</xref>. In addition to a substituted cyclohexane (<xref ref-type="fig" rid="F6">Figure&#x20;6B</xref>), several substituted allose molecules (<xref ref-type="fig" rid="F6">Figures 6C&#x2013;F</xref>) sharing a high chemical similarity with the label ligand, <italic>&#x3b2;</italic>-<sc>d</sc>-allose (<xref ref-type="fig" rid="F6">Figure&#x20;6A</xref>), were constructed. Most of these molecules dock well to the ALBP pocket with docking scores of 4.1, 3.7, 20.9, 3.5, and 9.8 for compounds shown in <xref ref-type="fig" rid="F6">Figures 6B&#x2013;F</xref>, respectively. Interestingly, a substituted cyclohexane in the molecule shown in <xref ref-type="fig" rid="F6">Figure&#x20;6B</xref> adopts the chair conformation similarly to <italic>&#x3b2;</italic>-<sc>d</sc>-allose bound to ALBP in the experimental complex structure. A compound shown in <xref ref-type="fig" rid="F6">Figures 6E,G</xref> has the best docking score, whereas that shown in <xref ref-type="fig" rid="F6">Figure&#x20;6D</xref> has a less favorable docking score than those ligands having a comparable size to <italic>&#x3b2;</italic>-<sc>d</sc>-allose because of the large substitution at the 5&#x2032; position that does not fit in the binding pocket of ALBP. Docking results suggest that molecules generated by Pocket2Drug are capable of forming favorable interactions with the target pocket.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Examples of molecules generated by Pocket2Drug for a binding site in ALBP. <bold>(A)</bold> The label ligand, <italic>&#x3b2;</italic>-<sc>d</sc>-allose. <bold>(B&#x2013;F)</bold> Molecules constructed by Pocket2Drug with maximum common substructures to the label ligand highlighted in cyan. <bold>(G)</bold> Molecule shown in <bold>E</bold> (ice blue) docked to the binding site in ALBP (orange).</p>
</caption>
<graphic xlink:href="fphar-13-837715-g006.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>Evaluation by Pocket Structure Alignments</title>
<p>In addition to the assessment by ligand chemical similarity described above, the performance of Pocket2Drug is also evaluated with pocket structure alignments. This approach is based on an assumption that a molecule generated for the target pocket is a hit if a similar molecule binds to a site that is structurally similar to the target pocket (<xref ref-type="bibr" rid="B17">Govindaraj and Brylinski, 2018</xref>; <xref ref-type="bibr" rid="B15">Gaieb et&#x20;al., 2019</xref>). A flowchart of the evaluation procedure is shown in <xref ref-type="fig" rid="F7">Figure&#x20;7</xref>. For a target pocket in the testing set (<xref ref-type="fig" rid="F7">Figure&#x20;7A</xref>), molecules generated by Pocket2Drug are ranked according to their frequencies and 100 of the most frequent molecules are selected. For each drug candidate (<xref ref-type="fig" rid="F7">Figure&#x20;7B</xref>), chemically similar ligands with a TC of &#x2265;0.7 are identified in the PubChem BioAssay dataset comprising 73,021 active interactions involving 919 unique proteins and 17,367 unique compounds (<xref ref-type="bibr" rid="B51">Wang et&#x20;al., 2012</xref>). Next, the experimental complex structures of these ligands bound to similar proteins with a sequence identity of &#x2265;70% to PubChem BioAssay targets are retrieved from the PDB. The extracted binding sites (<xref ref-type="fig" rid="F7">Figure&#x20;7C</xref>) are finally structurally aligned to the initial target pocket with PocketAlign, an accurate method to superpose ligand binding sites in a sequence order-independent manner (<xref ref-type="bibr" rid="B59">Yeturu and Chandra, 2011</xref>). Essentially, this procedure validates molecules generated for target pockets by finding similar interactions that have already been determined experimentally through binding assays and protein crystallography.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Flowchart of the evaluation by pocket structure alignments. For a target pocket <bold>(A)</bold>, a molecule is generated by Pocket2Drug <bold>(B)</bold>. This compound is then scanned through the PubChem BioAssay for similar molecules for which experimental complex structures are available in the Protein Data Bank. The extracted binding site <bold>(C)</bold> corresponding to the known interaction in PubChem BioAssay is structurally aligned to the target pocket by PocketAlign. A high-quality alignment <bold>(D)</bold> indicates that the generated molecule is likely to bind to the target pocket.</p>
</caption>
<graphic xlink:href="fphar-13-837715-g007.tif"/>
</fig>
<p>Similar to the evaluation protocol by ligand chemical similarity, Pocket2Drug is compared to ZINC and vanilla RNN. For each target pocket, 100 molecules from the ZINC database and 100 molecules generated by vanilla RNN are selected so that their molecular weight distributions match those calculated for compounds selected by Pocket2Drug. In terms of statistics, the number of pocket pairs used as input for structure alignments is 17,620 for Pocket2Drug, 6,307 for ZINC, and 6,694 for vanilla RNN. The number of valid pocket&#x20;alignments constructed by PocketAlign (<xref ref-type="bibr" rid="B59">Yeturu and Chandra, 2011</xref>) is 16,987 (Pocket2Drug), 741 (ZINC), and 4,902 (vanilla RNN). A valid pocket&#x20;alignment has the RMSD of &#x2264;2&#xa0;&#xc5;; higher RMSD values indicate that two pockets are structurally dissimilar. According to this criterion, as many as 96.4% of validation pairs of pockets identified using output molecules generated by Pocket2Drug produce valid structure alignments, while these percentages are notably lower for ZINC (11.7%) and vanilla RNN (73.2%). The distribution of the RMSD scores of pocket&#x20;alignments for all tested methods is presented in <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>. Not only does using molecules selected by Pocket2Drug result in the highest percentage of valid structure alignments, but RMSD values for these superpositions are also generally much lower compared to ZINC and vanilla RNN. The mean RMSD scores for Pocket2Drug, ZINC, and vanilla RNN are 1.1&#xa0;&#xc5;, 1.6&#xa0;&#xc5;, and 1.6&#xa0;&#xc5;, respectively.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Assessment of the quality of pocket&#x20;alignments constructed with PocketAlign. Alignment quality is evaluated by the root-mean-square deviation (RMSD) calculated over non-hydrogen atoms of binding residues. Target pockets are aligned to binding sites identified in the Protein Data Bank for molecules generated by Pocket2Drug (green) and two baselines, ZINC (red) and vanilla RNN (gray).</p>
</caption>
<graphic xlink:href="fphar-13-837715-g008.tif"/>
</fig>
<p>Structure alignment results demonstrate that for a large number of molecules generated by Pocket2Drug for target pockets, there are experimentally determined interactions between chemically similar ligands binding to structurally similar pockets. Two representative cases are selected to exemplify the evaluation by pocket structure alignments. The first target pocket is a nucleotide binding site in MSK1 used in the previous section to illustrate the results of the evaluation by ligand chemical similarity. Among molecules generated by Pocket2Drug, a compound ranked 12 with the frequency of 21 (<xref ref-type="fig" rid="F9">Figure&#x20;9A</xref>) is chemically similar to midostaurin (PubChem-CID: 9829523, <xref ref-type="fig" rid="F9">Figure&#x20;9B</xref>), a protein kinase C (PKC) inhibitor (<xref ref-type="bibr" rid="B11">Eder et&#x20;al., 2004</xref>) used to treat systemic mastocytosis, acute myeloid leukemia, and mast cell leukemia (<xref ref-type="bibr" rid="B37">National Cancer Institute Dictionary, 2021</xref>). According to the bioassay data (PubChem-BAID: 208295368), midostaurin inhibits PKC-&#x3b1; isoform with the half-maximal inhibitory concentration (IC<sub>50</sub>) of 22&#xa0;n<sc>m</sc> (<xref ref-type="bibr" rid="B35">Millward et&#x20;al., 2006</xref>). Midostaurin has been co-crystalized with the human dual specificity tyrosine-phosphorylation-regulated kinase 1A (DYRK1A, 25% sequence identity with PKC-&#x3b1;) with the equilibrium dissociation constant (<italic>K</italic>
<sub>d</sub>) of 100&#xa0;n<sc>m</sc> (PDB-ID: 4nct) (<xref ref-type="bibr" rid="B1">Alexeeva et&#x20;al., 2015</xref>). <xref ref-type="fig" rid="F9">Figure&#x20;9C</xref> shows the structure alignment constructed by PocketAlign between AMP-PNP binding pocket in MSK1 and midostaurin binding pocket in DYRK1A. Despite a low global sequence identity between these proteins of only 26%, their binding pockets are structurally highly similar with the RMSD of 0.90&#xa0;&#xc5;. The compound generated by Pocket2Drug docks to the AMP-PNP binding pocket in MSK1 with a score of 58.5 (<xref ref-type="fig" rid="F9">Figure&#x20;9D</xref>).</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Example of the evaluation by pocket&#x20;alignment for a binding site in MSK1. <bold>(A)</bold> A molecule generated by Pocket2Drug at rank 12. <bold>(B)</bold> A similar compound, midostaurin, with the maximum common substructure to Pocket2Drug molecule highlighted in cyan. <bold>(C)</bold> A structure alignment between the target binding site in MSK1 (orange) and midostaurin binding pocket in DYRK1A (purple). <bold>(D)</bold> The molecule generated by Pocket2Drug (ice blue) docked to the target site in MSK1 (orange) with fkcombu.</p>
</caption>
<graphic xlink:href="fphar-13-837715-g009.tif"/>
</fig>
<p>The second example is the human angiopoietin-1 receptor (Tie-2), an enzyme involved in vessel remodeling, branching, stability, and maturation (<xref ref-type="bibr" rid="B60">Yu, 2005</xref>). Using the binding site of Tie-2 as the input, Pocket2Drug generated a molecule shown in <xref ref-type="fig" rid="F10">Figure&#x20;10A</xref> at rank 9 with a frequency of 5. This compound is chemically similar, with a TC of 0.73, to doramapimod (PubChem-CID: 156422, <xref ref-type="fig" rid="F10">Figure&#x20;10B</xref>), an inhibitor of ephrin type-A receptor 2 (EphA2). According to the bioassay data (PubChem-BAID: 40394839), doramapimod binds to EphA2 with a <italic>K</italic><sub>d</sub> of 0.37&#xa0;n<sc>m</sc> and has been tested for its anti-proliferative activity in the SF-268 cell line. It inhibits the viability of EphA2 growth-dependent glioblastoma cells with a half-maximal effective concentration (EC<sub>50</sub>) of 5&#xa0;&#x3bc;<sc>m</sc> (<xref ref-type="bibr" rid="B20">Heinzlmeir et&#x20;al., 2017</xref>). Despite a low global sequence identity of 37%, the structure alignment of binding sites in Tie-2 (PDB-ID: 2oo8) and EphA2 (PDB-ID: 5nkd) yields an RMSD of 0.95&#xa0;&#xc5; (<xref ref-type="fig" rid="F10">Figure&#x20;10C</xref>). Docking simulations with fkcombu confirmed that the molecule generated by Pocket2Drug fits well into the binding site of Tie-2 with a score of 24.3 (<xref ref-type="fig" rid="F10">Figure&#x20;10D</xref>).</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Example of the evaluation by pocket&#x20;alignment for a binding site in Tie-2. <bold>(A)</bold> A molecule generated by Pocket2Drug at rank 9. <bold>(B)</bold> A similar compound, doramapimod, with the maximum common substructure to Pocket2Drug molecule highlighted in cyan. <bold>(C)</bold> A structure alignment between the target binding site in Tie-2 (orange) and doramapimod binding pocket in EphA2 (purple). <bold>(D)</bold> The molecule generated by Pocket2Drug (ice blue) docked to the target site in Tie-2 (orange) with fkcombu.</p>
</caption>
<graphic xlink:href="fphar-13-837715-g010.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>In this communication, we describe Pocket2Drug, a novel deep learning model employing an encoder-decoder architecture to predict binding molecules for a ligand binding site. Pocket2Drug was trained in an end-to-end supervised manner against a large collection of ligand-pocket pairs. The analysis of molecules generated by Pocket2Drug using two evaluation protocols based on ligand chemical similarity and pocket structure alignments revealed that this algorithm significantly improves the chances of finding binding ligands compared to traditional techniques. Pocket2Drug not only yields a high accuracy against ligand-free structures, but it also generalizes well to unseen data, <italic>viz</italic>. those pockets extracted from proteins that are different from training instances. These findings are particularly important in drug discovery against novel protein structures, where it can help significantly reduce the search space of drug candidates. In contrast to traditional virtual screening typically employing a library of 200,000 to over 1,000,000 molecules (<xref ref-type="bibr" rid="B22">Hughes et&#x20;al., 2011</xref>), Pocket2Drug generates molecules that have high chances to bind to target pockets within a smaller sample of 81,920 compounds. Therefore, it can potentially decrease the number of molecules to be subjected to structure-based virtual screening from millions to tens of thousands.</p>
<p>Pocket2Drug can be improved by incorporating reinforcement learning imposing additional restraints on the synthetic accessibility, solubility, and toxicity of generated molecules, depending on a specific application. Additional improvements can also be achieved by applying a framework similar to the conditional recurrent neural network (cRNN), utilizing the RNN with the prior information (<xref ref-type="bibr" rid="B57">Xu et&#x20;al., 2021</xref>), to the heterogeneous input data. In contrast to cRNN, in which the pre-computed information is used as the prior condition for RNN, Pocket2Drug is an end-to-end DNN; therefore, the encoder is updated during training. Another difference is the data representation; cRNN uses a voxel representation as the prior information, whereas Pocket2Drug employs a computationally more efficient graph representation. Nonetheless, the heterogeneous pocket data can be combined by concatenating embedding vectors generated by different feature extractors in order to provide the prior information on ligand binding&#x20;sites.</p>
<p>An attention mechanism was shown to significantly improve the performance of image captioning because it helps the model capture more semantically meaningful parts of images (<xref ref-type="bibr" rid="B55">Xu et&#x20;al., 2015</xref>). We expect that the same methodology can be implemented in Pocket2Drug since pocket residues contribute differently to the formation of molecular interactions with binding ligands. These are examples of future research directions that will be explored to further improve the performance of Pocket2Drug in the discovery of novel biopharmaceuticals.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://github.com/shiwentao00/Pocket2Drug">https://github.com/shiwentao00/Pocket2Drug</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://osf.io/qacwj/">https://osf.io/qacwj/</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>Conceptualization: WS; Methods: WS and LP; Dataset: MB; Evaluation and case studies: WS, MS, and GS; Supervision: MB; Funding acquisition: JR and MB; Manuscript draft: WS, LP, MS, and GS; Final manuscript:&#x20;MB.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This work has been supported in part by the National Institute of General Medical Sciences of the National Institutes of Health award R35GM119524, the US National Science Foundation award CCF1619303, the Louisiana Board of Regents contract LEQSF(2016-19)-RD-B03, and by the Center for Computation and Technology, Louisiana State University.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alexeeva</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>&#xc5;berg</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Engh</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Rothweiler</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>The Structure of a Dual-Specificity Tyrosine Phosphorylation-Regulated Kinase 1A-Pkc412 Complex Reveals Disulfide-Bridge Formation with the Anomalous Catalytic Loop HRD(HCD) Cysteine</article-title>. <source>Acta Crystallogr. D Biol. Crystallogr.</source> <volume>71</volume> (<issue>Pt 5</issue>), <fpage>1207</fpage>&#x2013;<lpage>1215</lpage>. <pub-id pub-id-type="doi">10.1107/S1399004715005106</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ali</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Hassan</surname>
<given-names>M. I.</given-names>
</name>
<name>
<surname>Islam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ahmad</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>A Review of Methods Available to Estimate Solvent-Accessible Surface Areas of Soluble Proteins in the Folded and Unfolded States</article-title>. <source>Curr. Protein Pept. Sci.</source> <volume>15</volume> (<issue>5</issue>), <fpage>456</fpage>&#x2013;<lpage>476</lpage>. <pub-id pub-id-type="doi">10.2174/1389203715666140327114232</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Altschul</surname>
<given-names>S. F.</given-names>
</name>
<name>
<surname>Gish</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Myers</surname>
<given-names>E. W.</given-names>
</name>
<name>
<surname>Lipman</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>1990</year>). <article-title>Basic Local Alignment Search Tool</article-title>. <source>J.&#x20;Mol. Biol.</source> <volume>215</volume> (<issue>3</issue>), <fpage>403</fpage>&#x2013;<lpage>410</lpage>. <pub-id pub-id-type="doi">10.1016/S0022-2836(05)80360-2</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baldi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Nasr</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>When Is Chemical Similarity Significant? The Statistical Distribution of Chemical Similarity Scores and Its Extreme Values</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>50</volume> (<issue>7</issue>), <fpage>1205</fpage>&#x2013;<lpage>1222</lpage>. <pub-id pub-id-type="doi">10.1021/ci100010v</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ben Lo</surname>
<given-names>J.&#x20;Z. T.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Chemical Similarity Networks for Drug Discovery</article-title>,&#x201d; in <source>Special Topics in Drug Discovery</source> (<publisher-name>Intech</publisher-name>), <fpage>53</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.5772/65106</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berman</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Battistuz</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Bhat</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Bluhm</surname>
<given-names>W. F.</given-names>
</name>
<name>
<surname>Bourne</surname>
<given-names>P. E.</given-names>
</name>
<name>
<surname>Burkhardt</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2002</year>). <article-title>The Protein Data Bank</article-title>. <source>Acta Crystallogr. D Biol. Crystallogr.</source> <volume>58</volume> (<issue>Pt 6 No 1</issue>), <fpage>899</fpage>&#x2013;<lpage>907</lpage>. <pub-id pub-id-type="doi">10.1107/s0907444902003451</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brylinski</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Feinstein</surname>
<given-names>W. P.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>eFindSite: Improved Prediction of Ligand Binding Sites in Protein Models Using Meta-Threading, Machine Learning and Auxiliary Ligands</article-title>. <source>J.&#x20;Comput. Aided Mol. Des.</source> <volume>27</volume> (<issue>6</issue>), <fpage>551</fpage>&#x2013;<lpage>567</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-013-9663-5</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brylinski</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Skolnick</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>What Is the Relationship between the Global Structures of Apo and Holo Proteins</article-title>. <source>Proteins</source> <volume>70</volume> (<issue>2</issue>), <fpage>363</fpage>&#x2013;<lpage>377</lpage>. <pub-id pub-id-type="doi">10.1002/prot.21510</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chaudhuri</surname>
<given-names>B. N.</given-names>
</name>
<name>
<surname>Ko</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Mowbray</surname>
<given-names>S. L.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Structure of D-Allose Binding Protein from Escherichia coli Bound to D-Allose at 1.8&#xa0;&#xc5; Resolution</article-title>. <source>J.&#x20;Mol. Biol.</source> <volume>286</volume> (<issue>5</issue>), <fpage>1519</fpage>&#x2013;<lpage>1531</lpage>. <pub-id pub-id-type="doi">10.1006/jmbi.1999.2571</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Cho</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>van Merrienboer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gulcehre</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bahdanau</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bougares</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schwenk</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). &#x201c;<article-title>Learning Phrase Representations Using RNN Encoder-Decoder for Statistical Machine Translation</article-title>,&#x201d; in <conf-name>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name>, <conf-loc>Doha, Qatar</conf-loc>, <conf-date>October 25&#x2013;29, 2014</conf-date>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eder</surname>
<given-names>J.&#x20;P.</given-names>
<suffix>Jr.</suffix>
</name>
<name>
<surname>Garcia-Carbonero</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>J.&#x20;W.</given-names>
</name>
<name>
<surname>Supko</surname>
<given-names>J.&#x20;G.</given-names>
</name>
<name>
<surname>Puchalski</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Ryan</surname>
<given-names>D. P.</given-names>
</name>
<etal/>
</person-group> (<year>2004</year>). <article-title>A Phase I Trial of Daily Oral 4&#x27;-N-Benzoyl-Staurosporine in Combination with Protracted Continuous Infusion 5-fluorouracil in Patients with Advanced Solid Malignancies</article-title>. <source>Invest. New Drugs</source> <volume>22</volume> (<issue>2</issue>), <fpage>139</fpage>&#x2013;<lpage>150</lpage>. <pub-id pub-id-type="doi">10.1023/B:DRUG.0000011790.31292.ef</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ertl</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schuffenhauer</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Estimation of Synthetic Accessibility Score of Drug-like Molecules Based on Molecular Complexity and Fragment Contributions</article-title>. <source>J.&#x20;Cheminform.</source> <volume>1</volume> (<issue>1</issue>), <fpage>8</fpage>. <pub-id pub-id-type="doi">10.1186/1758-2946-1-8</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ertl</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lewis</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Polyakov</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2017</year>). <source>In Silico Generation of Novel, Drug-like Chemical Matter Using the LSTM Neural Network</source>. <comment>arXiv preprint arXiv:1712.07449</comment>. </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fath</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Kolter</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1993</year>). <article-title>ABC Transporters: Bacterial Exporters</article-title>. <source>Microbiol. Rev.</source> <volume>57</volume> (<issue>4</issue>), <fpage>995</fpage>&#x2013;<lpage>1017</lpage>. <pub-id pub-id-type="doi">10.1128/mr.57.4.995-1017.1993</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gaieb</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Parks</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Amaro</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Evaluation of Binding Site Comparison Algorithms and Proteometric Machine Learning Models in the Detection of Protein Pockets Capable of Binding the Same Ligand</source>. <comment>ChemRxiv preprint ChemRxiv:9178136</comment>. </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gerry</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Schreiber</surname>
<given-names>S. L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Chemical Probes and Drug Leads from Advances in Synthetic Planning and Methodology</article-title>. <source>Nat. Rev. Drug Discov.</source> <volume>17</volume> (<issue>5</issue>), <fpage>333</fpage>&#x2013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.1038/nrd.2018.53</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Govindaraj</surname>
<given-names>R. G.</given-names>
</name>
<name>
<surname>Brylinski</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Comparative Assessment of Strategies to Identify Similar Ligand-Binding Pockets in Proteins</article-title>. <source>BMC Bioinformatics</source> <volume>19</volume> (<issue>1</issue>), <fpage>91</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-018-2109-2</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Graves</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2013</year>). <source>Generating Sequences with Recurrent Neural Networks</source>. <comment>arXiv preprint arXiv:1308.0850</comment>. </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>A. T.</given-names>
</name>
<name>
<surname>Huisman</surname>
<given-names>B. J.&#x20;H.</given-names>
</name>
<name>
<surname>Fuchs</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Generative Recurrent Networks for De Novo Drug Design</article-title>. <source>Mol. Inform.</source> <volume>37</volume> (<issue>1-2</issue>), <fpage>1700111</fpage>. <pub-id pub-id-type="doi">10.1002/minf.201700111</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heinzlmeir</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lohse</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Treiber</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kudlinzki</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Linhard</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gande</surname>
<given-names>S. L.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Chemoproteomics-Aided Medicinal Chemistry for the Discovery of EPHA2 Inhibitors</article-title>. <source>ChemMedChem</source> <volume>12</volume> (<issue>12</issue>), <fpage>999</fpage>&#x2013;<lpage>1011</lpage>. <pub-id pub-id-type="doi">10.1002/cmdc.201700217</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hochreiter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schmidhuber</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Long Short-Term Memory</article-title>. <source>Neural Comput.</source> <volume>9</volume> (<issue>8</issue>), <fpage>1735</fpage>&#x2013;<lpage>1780</lpage>. <pub-id pub-id-type="doi">10.1162/neco.1997.9.8.1735</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hughes</surname>
<given-names>J.&#x20;P.</given-names>
</name>
<name>
<surname>Rees</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kalindjian</surname>
<given-names>S. B.</given-names>
</name>
<name>
<surname>Philpott</surname>
<given-names>K. L.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Principles of Early Drug Discovery</article-title>. <source>Br. J.&#x20;Pharmacol.</source> <volume>162</volume> (<issue>6</issue>), <fpage>1239</fpage>&#x2013;<lpage>1249</lpage>. <pub-id pub-id-type="doi">10.1111/j.1476-5381.2010.01127.x</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Irwin</surname>
<given-names>J.&#x20;J.</given-names>
</name>
<name>
<surname>Shoichet</surname>
<given-names>B. K.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>ZINC&#x2014;a Free Database of Commercially Available Compounds for Virtual Screening</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>45</volume> (<issue>1</issue>), <fpage>177</fpage>&#x2013;<lpage>182</lpage>. <pub-id pub-id-type="doi">10.1021/ci049714&#x2b;</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jian</surname>
<given-names>J.&#x20;W.</given-names>
</name>
<name>
<surname>Elumalai</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pitti</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C. Y.</given-names>
</name>
<name>
<surname>Tsai</surname>
<given-names>K. C.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>J.&#x20;Y.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Predicting Ligand Binding Sites on Protein Surfaces by 3-dimensional Probability Density Distributions of Interacting Atoms</article-title>. <source>PLoS One</source> <volume>11</volume> (<issue>8</issue>), <fpage>e0160315</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0160315</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jim&#xe9;nez</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Doerr</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mart&#xed;nez-Rosell</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rose</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>De Fabritiis</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>DeepSite: Protein-Binding Site Predictor Using 3D-Convolutional Neural Networks</article-title>. <source>Bioinformatics</source> <volume>33</volume> (<issue>19</issue>), <fpage>3036</fpage>&#x2013;<lpage>3042</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btx350</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kabsch</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>1976</year>). <article-title>A Solution for the Best Rotation to Relate Two Sets of Vectors</article-title>. <source>Acta Cryst. Sect A.</source> <volume>32</volume> (<issue>5</issue>), <fpage>922</fpage>&#x2013;<lpage>923</lpage>. <pub-id pub-id-type="doi">10.1107/s0567739476001873</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kawabata</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Build-up Algorithm for Atomic Correspondence between Chemical Structures</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>51</volume> (<issue>8</issue>), <fpage>1775</fpage>&#x2013;<lpage>1787</lpage>. <pub-id pub-id-type="doi">10.1021/ci2001023</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kawabata</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Nakamura</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>3D Flexible Alignment Using 2D Maximum Common Substructure: Dependence of Prediction Accuracy on Target-Reference Chemical Similarity</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>54</volume> (<issue>7</issue>), <fpage>1850</fpage>&#x2013;<lpage>1863</lpage>. <pub-id pub-id-type="doi">10.1021/ci500006d</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krenn</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>H&#xe4;se</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Nigam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Friederich</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Aspuru-Guzik</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Self-Referencing Embedded Strings (SELFIES): A 100% Robust Molecular String Representation</article-title>. <source>Machine Learn. Sci. Techn.</source> <volume>1</volume> (<issue>4</issue>), <fpage>045024</fpage>. <pub-id pub-id-type="doi">10.1088/2632-2153/aba947</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Chemical Similarity Methods: A Tutorial Review</article-title>. <source>The Chem. Educator</source> <volume>16</volume>, <fpage>46</fpage>&#x2013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1333/s00897112344a</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lardy</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Schuster</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Ebel</surname>
<given-names>R. E.</given-names>
</name>
</person-group> (<year>1975</year>). <article-title>Exploring Sites on Mitochondrial ATPase for Catalysis, Regulation, and Inhibition</article-title>. <source>J.&#x20;Supramol. Struct.</source> <volume>3</volume> (<issue>3</issue>), <fpage>214</fpage>&#x2013;<lpage>221</lpage>. <pub-id pub-id-type="doi">10.1002/jss.400030303</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yeh</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chiang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jernigan</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Lustig</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Protein Sequence Entropy Is Closely Related to Packing Density and Hydrophobicity</article-title>. <source>Protein Eng. Des. Sel.</source> <volume>18</volume> (<issue>2</issue>), <fpage>59</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1093/protein/gzi009</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mahn</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lienqueo</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Salgado</surname>
<given-names>J.&#x20;C.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Methods of Calculating Protein Hydrophobicity and Their Application in Developing Correlations to Predict Hydrophobic Interaction Chromatography Retention</article-title>. <source>J.&#x20;Chromatogr. A.</source> <volume>1216</volume> (<issue>10</issue>), <fpage>1838</fpage>&#x2013;<lpage>1844</lpage>. <pub-id pub-id-type="doi">10.1016/j.chroma.2008.11.089</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Malakhova</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>D&#x27;Angelo</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. G.</given-names>
</name>
<name>
<surname>Kurinov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bode</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>The Crystal Structure of the Active Form of the C-Terminal Kinase Domain of Mitogen- and Stress-Activated Protein Kinase 1</article-title>. <source>J.&#x20;Mol. Biol.</source> <volume>399</volume> (<issue>1</issue>), <fpage>41</fpage>&#x2013;<lpage>52</lpage>. <pub-id pub-id-type="doi">10.1016/j.jmb.2010.03.064</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Millward</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>House</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bowtell</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Webster</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Olver</surname>
<given-names>I. N.</given-names>
</name>
<name>
<surname>Gore</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2006</year>). <article-title>The Multikinase Inhibitor Midostaurin (PKC412A) Lacks Activity in Metastatic Melanoma: A Phase IIA Clinical and Biologic Study</article-title>. <source>Br. J.&#x20;Cancer</source> <volume>95</volume> (<issue>7</issue>), <fpage>829</fpage>&#x2013;<lpage>834</lpage>. <pub-id pub-id-type="doi">10.1038/sj.bjc.6603331</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mouchlis</surname>
<given-names>V. D.</given-names>
</name>
<name>
<surname>Afantitis</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Serra</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fratello</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Papadiamantis</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Aidinis</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Advances in De Novo Drug Design: From Conventional to Machine Learning Methods</article-title>. <source>Int. J.&#x20;Mol. Sci.</source> <volume>22</volume> (<issue>4</issue>), <fpage>1676</fpage>. <pub-id pub-id-type="doi">10.3390/ijms22041676</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="web">
<collab>National Cancer Institute Dictionary</collab> (<year>2021</year>). <comment>Available from: <ext-link ext-link-type="uri" xlink:href="https://www.cancer.gov/publications/dictionaries/cancer-terms/def/n-benzoyl-staurosporine">https://www.cancer.gov/publications/dictionaries/cancer-terms/def/n-benzoyl-staurosporine</ext-link> (Accessed December 4, 2021)</comment>.</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Neuh&#xe4;user</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Manly</surname>
<given-names>B. F.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>The Fisher-Pitman Permutation Test when Testing for Differences in Mean and Variance</article-title>. <source>Psychol. Rep.</source> <volume>94</volume> (<issue>1</issue>), <fpage>189</fpage>&#x2013;<lpage>194</lpage>. <pub-id pub-id-type="doi">10.2466/pr0.94.1.189-194</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x27;Boyle</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Dalke</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>DeepSMILES: An Adaptation of SMILES for Use in Machine-Learning of Chemical Structures</article-title>. <source>ChemRxiv</source>. <pub-id pub-id-type="doi">10.26434/chemrxiv.7097960.v1</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>&#xd6;zt&#xfc;rk</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>&#xd6;zg&#xfc;r</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schwaller</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Laino</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ozkirimli</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Exploring Chemical Space Using Natural Language Processing Methodologies for Drug Discovery</article-title>. <source>Drug Discov. Today</source> <volume>25</volume> (<issue>4</issue>), <fpage>689</fpage>&#x2013;<lpage>705</lpage>. <pub-id pub-id-type="doi">10.1016/j.drudis.2020.01.020</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PyTorch: An Imperative Style, High-Performance Deep Learning Library</article-title>,&#x201d; in <conf-name>Proceedings of the Thirty-third Conference on Neural Information Processing Systems (NeurIPS)</conf-name>, <conf-loc>Vancouver, BC</conf-loc>, <conf-date>December 8&#x2013;14, 2019</conf-date>. </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Govindaraj</surname>
<given-names>R. G.</given-names>
</name>
<name>
<surname>Lemoine</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H. C.</given-names>
</name>
<name>
<surname>Brylinski</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>DeepDrug3D: Classification of Ligand-Binding Pockets in Proteins with a Convolutional Neural Network</article-title>. <source>PLoS Comput. Biol.</source> <volume>15</volume> (<issue>2</issue>), <fpage>e1006718</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1006718</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Segler</surname>
<given-names>M. H. S.</given-names>
</name>
<name>
<surname>Kogej</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tyrchan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Waller</surname>
<given-names>M. P.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Generating Focused Molecule Libraries for Drug Discovery with Recurrent Neural Networks</article-title>. <source>ACS Cent. Sci.</source> <volume>4</volume> (<issue>1</issue>), <fpage>120</fpage>&#x2013;<lpage>131</lpage>. <pub-id pub-id-type="doi">10.1021/acscentsci.7b00512</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Lemoine</surname>
<given-names>J.&#x20;M.</given-names>
</name>
<name>
<surname>Shawky</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Singha</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>BionoiNet: Ligand-Binding Site Classification with Off-The-Shelf Deep Neural Network</article-title>. <source>Bioinformatics</source> <volume>36</volume> (<issue>10</issue>), <fpage>3077</fpage>&#x2013;<lpage>3083</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa094</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Singha</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ramanujam</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Brylinski</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Graphsite: Ligand-Binding Site Classification Using Deep Graph Neural Network</article-title>. <source>bioRxiv</source>, <fpage>2021.12.06.471420</fpage>. <pub-id pub-id-type="doi">10.1101/2021.12.06.471420</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Massarweh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Osborne</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Wakeling</surname>
<given-names>A. E.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Weiss</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2004</year>). <article-title>Mechanisms of Tamoxifen Resistance: Increased Estrogen Receptor-HER2/neu Cross-Talk in ER/HER2-positive Breast Cancer</article-title>. <source>J.&#x20;Natl. Cancer Inst.</source> <volume>96</volume> (<issue>12</issue>), <fpage>926</fpage>&#x2013;<lpage>935</lpage>. <pub-id pub-id-type="doi">10.1093/jnci/djh166</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Simonovsky</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Meyers</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>DeeplyTough: Learning Structural Comparison of Protein Binding Sites</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>60</volume> (<issue>4</issue>), <fpage>2356</fpage>&#x2013;<lpage>2366</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.9b00554</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trebosc</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Gartenmann</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>T&#xf6;tzl</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lucchini</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Schellhorn</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Pieren</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Dissecting Colistin Resistance Mechanisms in Extensively Drug-Resistant Acinetobacter baumannii Clinical Isolates</article-title>. <source>mBio</source> <volume>10</volume> (<issue>4</issue>), <fpage>e01083</fpage>. <pub-id pub-id-type="doi">10.1128/mBio.01083-19</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Toshev</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Show and Tell: A Neural Image Caption Generator</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <conf-loc>Boston, MA</conf-loc>, <conf-date>June 8&#x2013;10, 2015</conf-date>, <fpage>3156</fpage>&#x2013;<lpage>3164</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2015.7298935</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kudlur</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Order Matters: Sequence to Sequence for Sets</article-title>,&#x201d; in <conf-name>Proceedings of the International Conference on Learning Representations</conf-name>, <conf-loc>San Juan, Puerto Rico</conf-loc>, <conf-date>May 2&#x2013;4, 2016</conf-date>. <comment>arXiv preprint arXiv:1511.06391</comment>. </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Suzek</surname>
<given-names>T. O.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>PubChem&#x27;s BioAssay Database</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume> (<issue>Database issue</issue>), <fpage>D400</fpage>&#x2013;<lpage>D412</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkr1132</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Weininger</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>1988</year>). <article-title>SMILES, a Chemical Language and Information System. 1. Introduction to Methodology and Encoding Rules</article-title>. <source>J.&#x20;Chem. Inf. Comput. Sci.</source> <volume>28</volume> (<issue>1</issue>), <fpage>31</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1021/ci00057a005</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>P. M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Leung</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>D. L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Mimicking Strategy for Protein-Protein Interaction Inhibitor Discovery by Virtual Screening</article-title>. <source>Molecules</source> <volume>24</volume> (<issue>24</issue>), <fpage>4428</fpage>. <pub-id pub-id-type="doi">10.3390/molecules24244428</pub-id> </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>How Significant Is a Protein Structure Similarity with TM-Score &#x3d; 0.5?</article-title> <source>Bioinformatics</source> <volume>26</volume> (<issue>7</issue>), <fpage>889</fpage>&#x2013;<lpage>895</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btq066</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kiros</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cho</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Courville</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Salakhutdinov</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). &#x201c;<article-title>Show, Attend and Tell: Neural Image Caption Generation with Visual Attention</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning</conf-name> (<publisher-loc>Lille, France</publisher-loc>: <publisher-name>PMLR</publisher-name>). </citation>
</ref>
<ref id="B56">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sonobe</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kawarabayashi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jegelka</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Representation Learning on Graphs with Jumping Knowledge Networks</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning</conf-name> (<publisher-loc>Stockholm, Sweden</publisher-loc>: <publisher-name>PMLR</publisher-name>). </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ran</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>De Novo Molecule Design through the Molecular Generative Model Conditioned by 3D Information of Protein Binding Sites</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>61</volume> (<issue>7</issue>), <fpage>3240</fpage>&#x2013;<lpage>3254</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.0c01494</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yasonik</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Multiobjective De Novo Drug Design with Recurrent Neural Networks and Nondominated Sorting</article-title>. <source>J.&#x20;Cheminform</source> <volume>12</volume> (<issue>1</issue>), <fpage>14</fpage>&#x2013;<lpage>19</lpage>. <pub-id pub-id-type="doi">10.1186/s13321-020-00419-6</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yeturu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Chandra</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>PocketAlign a Novel Algorithm for Aligning Binding Sites in Protein Structures</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>51</volume> (<issue>7</issue>), <fpage>1725</fpage>&#x2013;<lpage>1736</lpage>. <pub-id pub-id-type="doi">10.1021/ci200132z</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>The Dynamic Roles of Angiopoietins in Tumor Angiogenesis</article-title>. <source>Future Oncol.</source> <volume>1</volume> (<issue>4</issue>), <fpage>475</fpage>&#x2013;<lpage>484</lpage>. <pub-id pub-id-type="doi">10.2217/14796694.1.4.475</pub-id> </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Skolnick</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Scoring Function for Automated Assessment of Protein Structure Template Quality</article-title>. <source>Proteins</source> <volume>57</volume> (<issue>4</issue>), <fpage>702</fpage>&#x2013;<lpage>710</lpage>. <pub-id pub-id-type="doi">10.1002/prot.20264</pub-id> </citation>
</ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Skolnick</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>TM-align: a Protein Structure Alignment Algorithm Based on the TM-Score</article-title>. <source>Nucleic Acids Res.</source> <volume>33</volume> (<issue>7</issue>), <fpage>2302</fpage>&#x2013;<lpage>2309</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gki524</pub-id> </citation>
</ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zitnik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Modeling Polypharmacy Side Effects with Graph Convolutional Networks</article-title>. <source>Bioinformatics</source> <volume>34</volume> (<issue>13</issue>), <fpage>i457</fpage>&#x2013;<lpage>i466</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty294</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>