<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<?covid-19-tdm?>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">744170</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2021.744170</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep Learning Algorithms Achieved Satisfactory Predictions When Trained on a Novel Collection of Anticoronavirus Molecules</article-title>
<alt-title alt-title-type="left-running-head">Harigua-Souiai et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Deep Learning for COVID-19 Therapeutics</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Harigua-Souiai</surname>
<given-names>Emna</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/690210/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Heinhane</surname>
<given-names>Mohamed Mahmoud</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Abdelkrim</surname>
<given-names>Yosser Zina</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1444719/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Souiai</surname>
<given-names>Oussama</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1163001/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Abdeljaoued-Tej</surname>
<given-names>Ines</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1193861/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Guizani</surname>
<given-names>Ikram</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/684754/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Laboratory of Molecular Epidemiology and Experimental Pathology-LR16IPT04, Institut Pasteur de Tunis, Universit&#xe9; de Tunis El Manar, <addr-line>Tunis</addr-line>, <country>Tunisia</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Laboratory of BioInformatics BioMathematics and BioStatistics (BIMS)-LR20IPT09, Institut Pasteur de Tunis, University of Tunis El Manar, <addr-line>Tunis</addr-line>, <country>Tunisia</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Engineering School of Statistics and Information Analysis, University of Carthage, <addr-line>Ariana</addr-line>, <country>Tunisia</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/560727/overview">Xiangxiang Zeng</ext-link>, Hunan University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/438005/overview">Laurent Emmanuel Dardenne</ext-link>, National Laboratory for Scientific Computing (LNCC), Brazil</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/711875/overview">Khanh N. Q. Le</ext-link>, Taipei Medical University, Taiwan</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Emna Harigua-Souiai, <email>harigua.emna@gmail.com</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Computational Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>29</day>
<month>11</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>12</volume>
<elocation-id>744170</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>30</day>
<month>09</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Harigua-Souiai, Heinhane, Abdelkrim, Souiai, Abdeljaoued-Tej and Guizani.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Harigua-Souiai, Heinhane, Abdelkrim, Souiai, Abdeljaoued-Tej and Guizani</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Drug discovery and repurposing against COVID-19 is a highly relevant topic with huge efforts dedicated to delivering novel therapeutics targeting SARS-CoV-2. In this context, computer-aided drug discovery is of interest in orienting the early high throughput screenings and in optimizing the hit identification rate. We herein propose a pipeline for Ligand-Based Drug Discovery (LBDD) against SARS-CoV-2. Through an extensive search of the literature and multiple steps of filtering, we integrated information on 2,610 molecules having a validated effect against SARS-CoV and/or SARS-CoV-2. The chemical structures of these molecules were encoded through multiple systems to be readily useful as input to conventional machine learning (ML) algorithms or deep learning (DL) architectures. We assessed the performances of seven ML algorithms and four DL algorithms in achieving molecule classification into two classes: active and inactive. The Random Forests (RF), Graph Convolutional Network (GCN), and Directed Acyclic Graph (DAG) models achieved the best performances. These models were further optimized through hyperparameter tuning and achieved ROC-AUC scores through cross-validation of 85, 83, and 79% for RF, GCN, and DAG models, respectively. An external validation step on the FDA-approved drugs collection revealed a superior potential of DL algorithms to achieve drug repurposing against SARS-CoV-2 based on the dataset herein presented. Namely, GCN and DAG achieved more than 50% of the true positive rate assessed on the confirmed hits of a PubChem bioassay.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>artificial neural network</kwd>
<kwd>SARS-CoV-2</kwd>
<kwd>machine learning</kwd>
<kwd>graph convolutional networks</kwd>
<kwd>drug discovery and repurposing</kwd>
</kwd-group>
<contract-num rid="cn001">AID-OAA-A-11-00012</contract-num>
<contract-num rid="cn002">Cov2-anti-protease</contract-num>
<contract-sponsor id="cn001">National Academy of Sciences<named-content content-type="fundref-id">10.13039/100000209</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Institut Pasteur<named-content content-type="fundref-id">10.13039/501100003762</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Discovery and design of effective treatments against COVID-19 is currently an active research field. Tremendous efforts have been deployed worldwide to find new molecules with therapeutic potential against its pathogenic agent SARS-CoV-2 (<xref ref-type="bibr" rid="B51">Song et&#x20;al., 2021</xref>). The earliest achievements mainly consisted in drug repurposing attempts of previously described drugs able to affect the SARS-CoV such as chloroquine and its derivatives (<xref ref-type="bibr" rid="B58">Vincent et&#x20;al., 2005</xref>; <xref ref-type="bibr" rid="B41">Pastick et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B64">Yao et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B16">Galan et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B40">Moiseev et&#x20;al., 2021</xref>). Other antivirals or antibiotics were also assessed for their potential as COVID-19 therapeutics (<xref ref-type="bibr" rid="B45">Pillaiyar et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B25">Kelleni, 2021</xref>). Still, as of today, no candidates have been yet retained as a universal COVID-19 treatment (<xref ref-type="bibr" rid="B21">Hoffmann et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B11">Dragojevic Simic et&#x20;al., 2021</xref>). Various approaches were adopted, including computational methods toward a faster discovery of drugs, given the urgency of the global health situation.</p>
<p>Computational approaches may be split into two subcategories: Structure-Based Drug Discovery (SBDD) and Ligand-Based Drug Discovery (LBDD). For SBDD, the structure of a molecular target is used to perform virtual screenings of large chemical libraries. The most popular targets are the Spike protein, known as the S protein, the 3-Chymotrypsin-Like cysteine protease (3CLpro), also called the main protease (Mpro), and the Papain-Like protease (PLpro) (<xref ref-type="bibr" rid="B7">Chellapandi and Saranya, 2020</xref>; <xref ref-type="bibr" rid="B53">Trezza et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B67">Zhang et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B66">Zhai et&#x20;al., 2021</xref>). These approaches rely on the availability of structural data of SARS-CoV-2 proteins, which are noticeably abundant as compared to other organisms. In fact, as of July 14, 2021, the RCSB PDB database accounted for 446 structures of the S protein and its binding domains, 360 crystal structures of the 3CLpro, 35 for the PLpro, and 505 structures corresponding to other SARS-CoV-2 proteins (RCSB).</p>
<p>On the other hand, LBDD is more likely dependent on the availability of data on the biological activity of molecules. Machine learning (ML) approaches demonstrated their ability to predict the activity of novel molecules based on such data (<xref ref-type="bibr" rid="B3">Altae-Tran et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B36">Lo et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B56">Vamathevan et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B65">Zeng et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B28">Korkmaz, 2020</xref>). The underlying assumption is that chemically and topologically similar compounds may have similar bioactivities and targets (<xref ref-type="bibr" rid="B18">Gfeller et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B50">Shi et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B44">Perualila-Tan et&#x20;al., 2016</xref>). These approaches were extensively used in novel drug discovery (DD) and repurposing against COVID-19 (<xref ref-type="bibr" rid="B26">Keshavarzi Arshadi et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B5">Bung et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B63">Yang et&#x20;al., 2021</xref>). In fact, dedicated resources have been developed to facilitate and enhance international efforts toward DD against COVID-19. Namely, the COVID-19 Moonshot Consortium has deployed international efforts in tackling data collection and curation of molecules targeting the 3CLpro of SARS-CoV-2. Their approach allied with SBDD and LBDD techniques (<xref ref-type="bibr" rid="B1">Achdout et&#x20;al., 2020</xref>). In fact, data availability is a cornerstone in building reliable ML models. This being said, data in DD is often sparse, heterogeneous, noisy, or too few. 
Multiple efforts have been made to build ML algorithms able to deal with such limitations and achieve satisfactory predictions (<xref ref-type="bibr" rid="B12">Duran-Frigola et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B22">Irwin et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B62">Yang et&#x20;al., 2020</xref>).</p>
<p>Beyond COVID-19 research, ML and deep learning (DL) were applied to a variety of DD projects. Applications can be split into two types: 1) activity prediction through regression and 2) classification of molecules into classes, mostly active vs. inactive (<xref ref-type="bibr" rid="B48">Rifaioglu et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B56">Vamathevan et&#x20;al., 2019</xref>). ML algorithms are implemented and trained on binary or float values descriptors of a fixed length, generated using a chemical structure encoding system. The most popular encoding systems are either the physicochemical descriptors (molecular weight, H-bond donors, H-bond acceptors, rotatable bonds, etc.) or molecular fingerprints (<xref ref-type="bibr" rid="B23">Jing et&#x20;al., 2018</xref>). The latter correspond to a variety of algorithms that are able to capture topological features and properties within chemical structures. Most of them calculate a series of binary digits that encode the presence or the absence of particular substructures in the molecule. More recently, there was a rising interest in graph convolution networks as chemical structure encoding systems in the frame of DL applications in LBDD (<xref ref-type="bibr" rid="B39">Micheli, 2009</xref>; <xref ref-type="bibr" rid="B37">Lusci et&#x20;al., 2013</xref>; <xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B24">Kearnes et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B3">Altae-Tran et&#x20;al., 2017</xref>). A molecular graph is the most common machine-readable representation (<xref ref-type="bibr" rid="B9">David et&#x20;al., 2020</xref>). Chemical representations in these schemes lie in mapping the atoms and bonds of a molecule into sets of nodes and edges. Spatial relationships between the nodes are then encoded through network embedding. 
This leads to a low-dimensional vector representation of the molecular graph, preserving both network topology structure and node content information (<xref ref-type="bibr" rid="B60">Wu et&#x20;al., 2020</xref>). Graph convolutional networks (GCNs) then apply a series of convolution layers to construct the whole molecule encoder. Graphs have irregular designs and sizes; there is no spatial order attached to the nodes. As a result, traditional convolution on regular grid-like structures cannot be applied directly to graphs. In the literature, efforts have been made to generalize the convolution operator to non-Euclidean structured data, resulting in GCNs. GCNs have emerged as the state-of-the-art encoding when it comes to DD (<xref ref-type="bibr" rid="B52">Sun et&#x20;al., 2020</xref>), especially when one seeks to extract features with respect to the data structure. This extraction is done automatically from raw inputs (<xref ref-type="bibr" rid="B29">Lavecchia, 2019</xref>). Duvenaud et&#x20;al. presented a graph convolution method to encode molecule structures using a differentiable neural network (NN) that generalizes fingerprint-based features <italic>via</italic> backpropagation on an undirected graph representation of the molecule (<xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>). The authors demonstrated that applying graph convolution enhances property predictions as compared to conventional circular fingerprints. Kearnes et&#x20;al. also described a graph convolution approach that learns from a graph representation of the molecule while taking into account its structure and composition (<xref ref-type="bibr" rid="B24">Kearnes et&#x20;al., 2016</xref>).</p>
<p>Here, we present a dataset of molecules validated for their effects on SARS-CoV-2 and/or SARS-CoV through viral growth inhibition, cell-based, or enzymatic experiments. Data were collected through an extensive search of the literature and databases, curated and formatted for cheminformatics simulations toward LBDD against COVID-19. Chemical structures of the molecules were then encoded through multiple systems to be readily useful as input to conventional ML algorithms or for GCN. We ran an extensive set of simulations under different splitting and formatting conditions of the data to identify the ML and DL algorithms that could achieve satisfactory results. Most promising models were then optimized, and their performances were validated through cross-validation. An external validation step was performed to assess the potential of these algorithms to achieve drug repurposing using experimental data on the FDA-approved drugs collection.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and Methods</title>
<sec id="s2-1">
<title>2.1 Data Collection</title>
<p>The data collection process included three distinct approaches. The first consisted in literature mining. We collected data on molecules described in peer-reviewed papers as anticoronavirus effectors. Two beta-coronavirus species were considered: SARS-CoV (Severe Acute Respiratory Syndrome Coronavirus, 2003) and SARS-CoV-2 (Severe Acute Respiratory Syndrome Coronavirus, 2019). The second approach consisted in retrieving data on molecules deposited in the RCSB PDB as cocrystals with SARS-CoV and SARS-CoV-2 proteins, mainly the 3CL-protease and the Papain-Like protease. When available, activity data on these cocrystallized inhibitors were fetched from corresponding scientific publications. The third approach consisted in retrieving data from bioassays deposited in the PubChem database (<xref ref-type="bibr" rid="B70">Kim et&#x20;al., 2020</xref>). Priority was given to bioassays targeting SARS-CoV-2 or related molecular targets, with a special interest in large bioassays on other coronaviruses. Data collected from these bioassays correspond to viral growth inhibition or cell-based tests targeting a given viral enzyme. In total, data from 10&#x20;COVID-19 bioassays were included. These were complemented by four bioassays targeting SARS-CoV. PubChem IDs of the bioassays, their types, and sizes are listed in <xref ref-type="sec" rid="s11">Supplementary Table S1</xref>. Bioassay datasets were then formatted to be merged with the literature dataset previously collected. Data collected on each molecule included chemical structure, name, chemical name if indicated, activity, target virus, and any additional information such as identifiers in the PubChem database, <italic>in&#x20;vitro</italic> IC<sub>50</sub> values, <italic>in&#x20;cellulo</italic> IC<sub>50</sub> values, and any other valuable biological data (<italic>in vivo</italic> EC<sub>50</sub>, inhibition rate at a given concentration, etc.). 
The chemical structure of the molecules was encoded using the Simplified Molecular Input Line Entry System (SMILES). For compounds with a graphical description of their structure in the literature, we used the Optical Structure Recognition Application (OSRA) tool (<xref ref-type="bibr" rid="B15">Filippov and Nicklaus, 2009</xref>) to correctly infer the corresponding SMILES. For compounds referred to in the literature using a common name, SMILES were directly retrieved from the PubChem database. Duplicates were removed using a similarity threshold of 97% based on the Tanimoto coefficient. Each molecule was assigned an activity status that can be &#x201c;active,&#x201d; &#x201c;inactive,&#x201d; or &#x201c;inconclusive.&#x201d; For molecules retrieved from PubChem bioassays, these status values were provided from the experimentalists&#x2019; data. For molecules fetched in the literature, these status values were deduced from the authors&#x2019; conclusions. For the molecules retrieved from the PDB records, these status values were assigned to &#x201c;active&#x201d; by default. In fact, we considered that the ability of a molecule to bind to a given protein receptor encloses valuable information on potential active moieties, although no biological activity is reported for these molecules. Any data point with inconclusive or blurry value was discarded for robustness&#x20;sake.</p>
</sec>
<sec id="s2-2">
<title>2.2 Datasets Construction</title>
<p>The benchmark datasets used herein were split using two different approaches. First, a random split with no consideration for chemical equilibration among the training, validation, and test sets was applied. Then, a scaffold split (<xref ref-type="bibr" rid="B46">Ramsundar et&#x20;al., 2019</xref>) was applied. The scaffold split method would cluster molecules based on the Murcko scaffold calculated using RDKit. Compounds with different scaffolds are placed into different sets (<xref ref-type="bibr" rid="B46">Ramsundar et&#x20;al., 2019</xref>). This significantly reduces the overlap of chemical scaffolds between the training and the test sets (<xref ref-type="bibr" rid="B46">Ramsundar et&#x20;al., 2019</xref>).</p>
<p>In addition, we tested how the size of the validation and test sets would affect the algorithms&#x2019; performances. Thus, we tested two scenarios: 80/10/10 and 60/20/20 split. An additional splitting method of the original dataset that permitted the generation of category-specific subsets for validation purposes was applied. Undersampling and oversampling were applied in order to obtain equilibrated datasets in each case. Undersampling consisted in reducing the inactive molecules subset to achieve equilibrated classes. Oversampling consisted in artificially generating additional SMILES of the active molecules in order to reach the inactive subset&#x20;size.</p>
</sec>
<sec id="s2-3">
<title>2.3 Molecular Structure Embedding</title>
<p>Based on the SMILES, we calculated either molecular fingerprints or graph convolution-based features that consist in binary or float values vectors to be used as input to the ML and DL algorithms, respectively. As fingerprints, we chose the extended-connectivity fingerprints with a radius of two atoms (ECFP4), also known as the circular Morgan fingerprints (<xref ref-type="bibr" rid="B49">Rogers and Hahn, 2010</xref>), to encode the molecule structures for ML algorithms. We used the RDKit library to generate 2,048-bit length ECFP4. Molecules with erroneous SMILES or chemistry were removed at this stage. We used these fingerprints to calculate the Tanimoto coefficient of similarity in a pairwise fashion. This metric consists of the fraction of the intersection over the union of the set of chemical substructures between two molecules. It is one of the most used to assess the chemical similarity between molecules (<xref ref-type="bibr" rid="B8">Chung et&#x20;al., 2019</xref>). As for the graph convolution-based features, depending on the DL architecture requirement, two featurizers were used:<list list-type="simple">
<list-item>
<p>&#x2022; The ConvMolFeat featurizer (<xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>) to generate input for the Graph Convolutional (GraphConv) (<xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>) and the Directed Acyclic Graph (DAG) (<xref ref-type="bibr" rid="B37">Lusci et&#x20;al., 2013</xref>) models.</p>
</list-item>
<list-item>
<p>&#x2022; The MolGraphConvFeat (<xref ref-type="bibr" rid="B24">Kearnes et&#x20;al., 2016</xref>) to generate input for the GAT (<xref ref-type="bibr" rid="B57">Velickovic et&#x20;al., 2018</xref>) and the GCN (<xref ref-type="bibr" rid="B27">Kipf and Welling, 2016</xref>) models.</p>
</list-item>
</list>
</p>
<p>Graphical convolutional models map molecules as undirected graphs whose vertices and edges represent individual atoms and bonds, respectively. Graphical convolutions extract meaningful patterns from basic descriptions of graph structure (atom and bond properties and graph distances) to form molecule-level representations. They are considered fully integrated approaches to virtual screening. The output of the model is invariant to the order in which the atom and bond information is encoded in the input. The graph represents class similarity information and is fed into DL classification models.</p>
</sec>
<sec id="s2-4">
<title>2.4 ML and DL Algorithms</title>
<p>We implemented multiple artificial intelligence (AI) algorithms to develop classification models: ML, ensemble learning methods (EL), and DL. We implemented seven ML algorithms, out of which two are simple ML algorithms, namely, Logistic Regression (LR) and Support Vector Machine (SVM). Five additional EL algorithms were implemented, namely, Random Forests (RF), Multitask Classifier (MTC), IRV-MTC, Robust MTC, and Gradient Boosting (XGBoost). EL methods are learning algorithms that construct a set of classifiers and then classify new data points by taking a weighted vote of the predictions of these classifiers (<xref ref-type="bibr" rid="B10">Dietterich, 2000</xref>). These algorithms were implemented under Scikit-learn, an open-source Python library (<xref ref-type="bibr" rid="B43">Pedregosa et&#x20;al., 2011</xref>). LR measures the relationship between a categorical dependent variable and one or more explanatory variables. This is performed by estimating probabilities using a logistic function, which is the cumulative logistic distribution, thus predicting the probability of certain outcomes. The SVM is one of the most popular supervised ML algorithms. It is effective in high-dimensional spaces. The hyperplane learning in the SVM algorithm can be performed using different kernel functions for the decision function. The RF method is an ensemble method, based on decision trees. The model fits on various subsamples of the dataset and uses averaging to improve predictive accuracy and control overfitting. The Gradient Boosting model implemented herein is called XGBoost (<xref ref-type="bibr" rid="B42">Paul et&#x20;al., 2020</xref>). It is an extreme gradient boosting algorithm and a decision tree-based boosting integration algorithm (<xref ref-type="bibr" rid="B14">Ericksen et&#x20;al., 2017</xref>). Further ensemble methods have been tested: Multitask Classifier (MTC), IRV-MTC, and Robust MTC. 
These are fully connected NN, where various hyperparameters are optimized. They operate like EL algorithms, where they integrate data from different tasks to achieve classification. When used on a single task data, they are a nonlinear classifier that performs repeated linear and nonlinear transformations on one single task (<xref ref-type="bibr" rid="B47">Ramsundar et&#x20;al., 2017</xref>).</p>
<p>Then, four DL architectures were implemented under the DeepChem library (<xref ref-type="bibr" rid="B46">Ramsundar et&#x20;al., 2019</xref>): the Graph Convolutional Model (GraphConv), the DAG model, the Graph Attention Networks model (GAT), and the GCN model. The GraphConv Model (<xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>) learns a vector representing the compound from the graph-based representation of the molecule. It predicts the target value directly through graph convolution operations. Convolutional networks operate the same operation locally and globally and combine the information in a common pooling step. Feature extraction involves computing an initial feature vector and a list of neighbors for each atom. The feature vector summarizes the local chemical environment of the atom, including atomic types, hybridization types, and valence structures. The neighbor lists map the connectivity of the entire molecule and are then processed in each model to generate graph structures (<xref ref-type="bibr" rid="B61">Wu et&#x20;al., 2018</xref>). The DAG model is an ensemble of recursive NN that associate all vertex-centered acyclic orientations of the graph representation of the molecule. It is slightly dependent on the molecular descriptors since suitable representations are learned from the DAG representation (<xref ref-type="bibr" rid="B37">Lusci et&#x20;al., 2013</xref>). The graph attentional layer (GAT) model (<xref ref-type="bibr" rid="B57">Velickovic et&#x20;al., 2018</xref>) is a convolutional NN that operates on graph-structured data, taking advantage of self-attention hidden layers. The attention mechanism is applied in a shared manner to all edges of the graph and thus does not depend on prior access to the overall structure of the graph or to (characteristics of) all its nodes. It allows assigning (implicitly) different importance to the nodes of the same neighborhood. 
GCN is an implementation of graph convolutional NN (<xref ref-type="bibr" rid="B27">Kipf and Welling, 2016</xref>). It learns hidden layer representations that are able to encode both individual features of nodes and their respective environments. It computes a weighted sum of the node representations in the graph, where the weights are computed by applying a gating function to the node representations, and then applies a max pooling of the node representations. It performs the final prediction using a multilayer perceptron (MLP) over a concatenation of the last convolution layer output. It differs from the GraphConv model by the fact that, for each graph convolution, the learnable weight in this model is shared across all nodes. The GraphConv model computes separate learnable weights for&#x20;nodes.</p>
<p>Under the DeepChem library, both the GraphConv Model and the DAG model were implemented to learn from the ConvMolFeat featurizer (<xref ref-type="bibr" rid="B13">Duvenaud et&#x20;al., 2015</xref>) that corresponds to a GCN that learns from a circular Morgan fingerprint-like representation of the molecule. On the other hand, the GAT and GCN models have been implemented in a way that they can learn from the MolGraphConv featurizer (<xref ref-type="bibr" rid="B24">Kearnes et&#x20;al., 2016</xref>). Data were split into training, validation, and test sets. The hyperparameters of the DL models were tuned using the loss of the validation&#x20;sets.</p>
</sec>
<sec id="s2-5">
<title>2.5 Model Evaluation and Selection</title>
<p>We performed the first comparison of all models&#x2019; performances with hyperparameters set to the optimal values obtained through the MoleculeNet benchmarks (<xref ref-type="bibr" rid="B61">Wu et&#x20;al., 2018</xref>). To better evaluate the different models, we calculated multiple performance metrics, including the ROC-AUC, accuracy, F1-score, Matthews correlation coefficient (MCC) (<xref ref-type="bibr" rid="B38">Matthews, 1975</xref>), and Cohen&#x2019;s Kappa coefficient (<italic>&#x3ba;</italic>). Then, we performed a cross evaluation of the model performances when trained and tested on stratified subsets of the data based on the different categories of targets. Accuracy, F1-score, Recall, and specificity were used as evaluation metrics for these simulations.</p>
<p>For the metric definitions, the following abbreviations are used: the number of true positives (TP), the number of false positives (FP), the number of true negatives (TN), and the number of false negatives (FN). Specificity, also called the True Negative Rate (TNR), is the model&#x2019;s ability to correctly reject an inactive molecule. The specificity of a test is the proportion of truly inactive molecules that are classified as such. It is defined as follows:<disp-formula id="e1">
<mml:math id="m1">
<mml:mtext>Specificity</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">TN</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">TN</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">FP</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Model Recall can be thought of as the percentage of true class labels correctly identified by the model as true. It is equal to the model sensitivity in binary classification and is also called the True Positive Rate (TPR). It is defined as follows:<disp-formula id="e2">
<mml:math id="m2">
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>The F1-score is the harmonic mean of the Recall and precision:<disp-formula id="e3">
<mml:math id="m3">
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>-</mml:mo>
<mml:mtext>score</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#x2217;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>where precision is the probability that a label predicted as true is actually true and is defined as follows:<disp-formula id="e4">
<mml:math id="m4">
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">s</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">o</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>Accuracy is the percentage of correctly identified labels out of the entire population.<disp-formula id="e5">
<mml:math id="m5">
<mml:mi mathvariant="normal">A</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">u</mml:mi>
<mml:mi mathvariant="normal">r</mml:mi>
<mml:mi mathvariant="normal">a</mml:mi>
<mml:mi mathvariant="normal">c</mml:mi>
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>The ROC-AUC score tells how much the model is capable of distinguishing between classes. It varies between 0 and 1, where 1 means a perfect prediction. The MCC is a correlation coefficient between the observed and predicted binary classifications. It is between &#x2212;1 and &#x2b;1, where &#x2b;1 indicates a perfect prediction, 0 indicates no better than random, and &#x2212;1 indicates total disagreement between prediction and observation.<disp-formula id="e6">
<mml:math id="m6">
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mi mathvariant="normal">C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>Cohen&#x2019;s Kappa method measures interclassifier agreement in qualitative classification tasks. It evaluates the agreement between two classifiers and takes into account the random occurrence of the agreement. A value close to one denotes better agreement between the results and ground truth.<disp-formula id="e7">
<mml:math id="m7">
<mml:mi>&#x3ba;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn mathvariant="normal">2</mml:mn>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2217;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2217;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="normal">T</mml:mi>
<mml:mi mathvariant="normal">N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>The best performers were then selected for hyperparameter optimization on the particular anticoronavirus dataset collected through the present study. Their performances were mainly assessed through ROC-AUC, F1-score, Recall, Accuracy, MCC, and Cohen&#x2019;s Kappa scores, which are a set of popular metrics in evaluating ML algorithms in a variety of applications (<xref ref-type="bibr" rid="B32">Le et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B30">Le and Huynh, 2019</xref>; <xref ref-type="bibr" rid="B31">Le and Nguyen, 2019</xref>). ML algorithm optimization included all optimizable parameters for the respective model. For DL architectures, the number of epochs, the batch size, the learning rate, the dropout, and the number of graph features were optimized where applicable. We selected the configuration that maximizes the ROC-AUC of the model on the validation set. The accuracy, the F1-score, the MCC, and Cohen&#x2019;s Kappa coefficient were also calculated for all combinations.</p>
<p>Tenfold cross-validation was performed, and the mean ROC-AUC, F1-score, and Recall values were reported. A stratified validation was also applied in order to assess the ability of the algorithms trained on the heterogeneous dataset to correctly predict active molecules from different categories of experiments. The sensitivity (Recall) and specificity were herein used as performance indicators. The optimized models were then subject to an external validation using an unseen set of molecules. We used a PubChem bioassay that consisted in a primary screen of 1,518&#x20;FDA-approved molecules against SARS-CoV-2-infected cells (AID_1409594). A total number of 17 hits were retained as potentially active molecules, and their antiviral efficacy was further confirmed through a second assay (AID_1409595). We performed a prediction of these 1,518&#x20;FDA-approved drugs as anti-SARS-CoV-2 inhibitors using the best performing algorithms.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Integration Efforts Led to a Curated Dataset of Anticoronavirus Molecules</title>
<p>We collected data on molecules with anticoronavirus effects, out of which 533 were retrieved from literature. All remaining compounds were collected from 14 PubChem bioassays. Since activity types were different from one source to the other, we considered the activity as a binary variable. Initially, four classes of activity status were listed: active, inactive, unspecified, and inconclusive. Only molecules within the first two classes were retained in the frame of the present work. The combined set of active and inactive molecules was subject to redundancy check, and duplicates were removed. The number of active molecules was equal to 1,305 at this stage. We then looked to obtain an equal number (1,305) of inactive molecules, which were in larger numbers, namely, within large bioassays. Thus, from some SARS-CoV bioassays, only a subset of inactive molecules was randomly selected (see <xref ref-type="sec" rid="s11">Supplementary Table S1</xref>). Ultimately, 2,610 nonredundant compounds were obtained. We performed a structural similarity analysis to assess the chemical diversity of the dataset (<xref ref-type="fig" rid="F1">Figure&#x20;1</xref>). Based on the circular Morgan fingerprints, we calculated the pairwise distance between all compounds using the Tanimoto similarity coefficient. The similarity distribution demonstrated very few values higher than 60%. This indicates a high chemical diversity within the dataset. Also, experiments that revealed these molecules included enzymatic activity assays against one of the viral proteases 3CLpro and PLpro, inhibition assays targeting the whole virus, and cell-based assays. We defined most relevant experiment categories as follows: 3CLpro_cov, 3CLpro_cov2, PLpro_cov, PLpro_cov2, and viral_cov2. Each category presents a specific count in terms of active and inactive molecules (<xref ref-type="fig" rid="F1">Figure&#x20;1</xref>), revealing unbalanced and insufficient data within some categories. 
Within the molecules with known molecular targets, only 0.7% were targeting the PLpro of SARS-CoV-2, while 40.6% were targeting PLpro of SARS-CoV (<xref ref-type="fig" rid="F1">Figure&#x20;1</xref>). The remaining molecules were targeting the 3CLpro of SARS-CoV-2 (6.3%) and 3CLpro SARS-CoV (52.4%). This bias reflects the higher interest toward the 3CLpro as a therapeutic target against coronaviruses (<xref ref-type="bibr" rid="B63">Yang et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B66">Zhai et&#x20;al., 2021</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Anticoronavirus dataset composition. <bold>(A)</bold> Distribution of the pairwise chemical similarity among the molecules based on the Tanimoto coefficient. <bold>(B)</bold> Proportions of &#x201c;active&#x201d; and &#x201c;inactive&#x201d; molecules within each experimental category.</p>
</caption>
<graphic xlink:href="fgene-12-744170-g001.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Graph Convolution-Based Models Compete With Baseline ML Algorithms</title>
<p>At this stage, we had 2,610 anticoronavirus molecules at our disposal. We used a random and a scaffold split of the dataset using two splitting proportions of the training, validation, and test sets as follows: 80/10/10 and 60/20/20. We sought to identify which scenario is overall optimal. The final datasets, ready for the upcoming experiments, are available on GitHub.</p>
<p>We first ran preliminary simulations of seven ML algorithms and four DL algorithms using the hyperparameter values released by the MoleculeNet authors (<xref ref-type="bibr" rid="B61">Wu et&#x20;al., 2018</xref>). These optimized values were tuned on multiple types of datasets related to DD tasks. Test sets representing 10% of the dataset yielded significantly better results than test sets of size 20% (<xref ref-type="sec" rid="s11">Supplementary Table S2</xref>). This highlighted the need to keep the training set at its higher size in order to reach satisfying levels of training. Such proportions also demonstrated the highest scores of few-shot learning algorithms (<xref ref-type="bibr" rid="B35">Liu et&#x20;al., 2021</xref>).</p>
<p>To better understand to which extent the heterogeneity of our dataset may be influential, we considered a subset of homogeneous data from the largest PubChem bioassay on SARS-CoV within our dataset: AID_1706. It is a biochemical assay targeting the enzymatic activity of the 3CLpro of SARS-CoV, through which 290,893 compounds were tested. A total of 405 molecules showed an inhibitory effect on the 3CLpro-mediated peptide cleavage. Based on this bioassay, we generated one undersampled (810 molecules) and one oversampled (2,430 molecules) homogeneous datasets. On randomly split data, ROC-AUC scores on the heterogeneous dataset were the most stable across the different algorithms. The best results were exhibited by RF and SVM on the oversampled homogeneous dataset. For the DL algorithms, GraphConv model, DAG, and GCN demonstrated satisfying performances (<inline-formula id="inf1">
<mml:math id="m8">
<mml:mo>&#x3e;</mml:mo>
</mml:math>
</inline-formula>80%) on the oversampled and heterogeneous datasets, with comparable values. Overall, six out of eleven models presented similar ROC-AUC scores between the heterogeneous and the oversampled homogeneous datasets (<xref ref-type="fig" rid="F2">Figure&#x20;2A</xref>). Noticeably, these datasets had comparable sizes and were larger than the undersampled homogeneous dataset. This confirms the sensitivity of the AI models&#x2019; performances to the dataset&#x2019;s size (<xref ref-type="bibr" rid="B62">Yang et&#x20;al., 2020</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>ROC-AUC scores of all models for three different datasets (heterogeneous, undersampled homogeneous, and oversampled homogeneous). <bold>(A)</bold> ROC-AUC scores achieved by all models under the random 80/10/10 split. <bold>(B)</bold> ROC-AUC scores achieved by all models under the scaffold 80/10/10 split. <bold>(C)</bold> Boxplots of the ROC-AUC scores achieved by each model on all validation subsets (heterogeneous, undersampled homogeneous, and oversampled homogeneous included) and with both splitting proportions (80/10/10; 60/20/20). <bold>(D)</bold> Boxplots of the ROC-AUC scores achieved by each model on all test subsets (heterogeneous, undersampled homogeneous, and oversampled homogeneous included) and with both splitting proportions (80/10/10; 60/20/20).</p>
</caption>
<graphic xlink:href="fgene-12-744170-g002.tif"/>
</fig>
<p>On scaffold-based split datasets, ROC-AUC scores were lower than those obtained with the randomly split data (<xref ref-type="fig" rid="F2">Figure&#x20;2B</xref>). Moreover, the lowest values were observed for the oversampled homogeneous data, while the highest were obtained with the undersampled homogeneous data. The heterogeneous dataset achieved scores comparable to the undersampled dataset varying between 61 and 80%. This scheme was observed in overall simulations (<xref ref-type="fig" rid="F2">Figures 2C,D</xref>). The difference between scores obtained with the oversampled and the heterogeneous datasets, at equal sizes, indicated a lower chemical diversity (number of scaffolds) within the homogeneous dataset. Thus, scaffold splitting induced lower diversity across the train and the test sets, which points out the interest of using a random split of the heterogeneous dataset in building performing ML/DL models. For the upcoming simulations, we will report results on the heterogeneous dataset using an 80/10/10 random&#x20;split.</p>
<p>The scores of the training, the validation, and the test sets obtained with all splitting combinations showed little to no overfitting, as no significant differences were observed between these sets&#x2019; scores overall (<xref ref-type="table" rid="T1">Table&#x20;1</xref>). According to the ROC-AUC scores on the test set, RF and SVM were the best classifiers within the ML/EL algorithms (<xref ref-type="fig" rid="F2">Figure&#x20;2</xref>). Although the Multitask Classifier (MTC) and its variants IRV and Robust MTC exhibited higher Recall, they exhibited lower values of ROC-AUC and F1-score. We concluded that RF and SVM were the most likely to correctly predict the active molecules as being active. In the set of DL architectures, the DAG and the GCN models were the best performers. They both achieved ROC-AUC scores of 87%, F1-scores of 73 and 79%, and Recall values equal to 68 and 82%, respectively (<xref ref-type="table" rid="T1">Table&#x20;1</xref>). Noticeably, the DAG model had markedly higher performances on the train set (98&#x2013;99% for all metrics). This was not the case for the GCN. This indicated that the herein used hyperparameters for the DAG model were close to the optimal configuration for our case study. We may expect better results for the GCN algorithm after the optimization step. For the upcoming steps, we will consider the RF, the DAG, and the GCN models for hyperparameter tuning and optimization.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performances of 11 algorithms in predicting activity class of the anticoronavirus dataset. Optimized settings based on the MoleculeNet benchmarks were considered for all models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th align="center">Train</th>
<th align="center">Validation</th>
<th align="center">Test</th>
<th align="center">Train</th>
<th align="center">Validation</th>
<th align="center">Test</th>
<th align="center">Train</th>
<th align="center">Validation</th>
<th align="center">Test</th>
</tr>
<tr>
<th align="center">ROC-AUC</th>
<th align="center">ROC-AUC</th>
<th align="center">ROC-AUC</th>
<th align="center">F1-score</th>
<th align="center">F1-score</th>
<th align="center">F1-score</th>
<th align="center">Recall</th>
<th align="center">Recall</th>
<th align="center">Recall</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">GraphConv</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.80</td>
<td align="char" char=".">0.86</td>
<td align="char" char=".">0.98</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.79</td>
<td align="char" char=".">0.98</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.80</td>
</tr>
<tr>
<td align="left">DAG</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.87</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.72</td>
<td align="char" char=".">0.73</td>
<td align="char" char=".">0.98</td>
<td align="char" char=".">0.68</td>
<td align="char" char=".">0.68</td>
</tr>
<tr>
<td align="left">GAT</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.77</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.62</td>
<td align="char" char=".">0.65</td>
<td align="char" char=".">0.69</td>
<td align="char" char=".">0.54</td>
<td align="char" char=".">0.55</td>
<td align="char" char=".">0.61</td>
</tr>
<tr>
<td align="left">GCN</td>
<td align="char" char=".">0.94</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.87</td>
<td align="char" char=".">0.86</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.79</td>
<td align="char" char=".">0.88</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.82</td>
</tr>
<tr>
<td align="left">LR</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.81</td>
<td align="char" char=".">0.89</td>
<td align="char" char=".">0.97</td>
<td align="char" char=".">0.76</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.97</td>
<td align="char" char=".">0.77</td>
<td align="char" char=".">0.82</td>
</tr>
<tr>
<td align="left">SVM</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.86</td>
<td align="char" char=".">0.90</td>
<td align="char" char=".">0.97</td>
<td align="char" char=".">0.80</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.97</td>
<td align="char" char=".">0.79</td>
<td align="char" char=".">0.82</td>
</tr>
<tr>
<td align="left">RF</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.86</td>
<td align="char" char=".">0.90</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.78</td>
<td align="char" char=".">0.81</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.80</td>
<td align="char" char=".">0.81</td>
</tr>
<tr>
<td align="left">MTC</td>
<td align="char" char=".">0.81</td>
<td align="char" char=".">0.77</td>
<td align="char" char=".">0.84</td>
<td align="char" char=".">0.67</td>
<td align="char" char=".">0.71</td>
<td align="char" char=".">0.68</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.99</td>
<td align="char" char=".">0.99</td>
</tr>
<tr>
<td align="left">IRV-MTC</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.85</td>
<td align="char" char=".">0.75</td>
<td align="char" char=".">0.78</td>
<td align="char" char=".">0.76</td>
<td align="char" char=".">0.88</td>
<td align="char" char=".">0.89</td>
<td align="char" char=".">0.90</td>
</tr>
<tr>
<td align="left">Robust MTC</td>
<td align="char" char=".">0.83</td>
<td align="char" char=".">0.80</td>
<td align="char" char=".">0.85</td>
<td align="char" char=".">0.71</td>
<td align="char" char=".">0.73</td>
<td align="char" char=".">0.71</td>
<td align="char" char=".">0.97</td>
<td align="char" char=".">0.96</td>
<td align="char" char=".">0.99</td>
</tr>
<tr>
<td align="left">XGBoost</td>
<td align="char" char=".">0.93</td>
<td align="char" char=".">0.84</td>
<td align="char" char=".">0.88</td>
<td align="char" char=".">0.85</td>
<td align="char" char=".">0.76</td>
<td align="char" char=".">0.80</td>
<td align="char" char=".">0.82</td>
<td align="char" char=".">0.73</td>
<td align="char" char=".">0.84</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-3">
<title>3.3 Optimization Led to Comparable Performances of all Models</title>
<p>Hyperparameter tuning of the selected models led to the identification of the combination of parameters that maximizes the model&#x2019;s ROC-AUC score. The detailed optimization results, the retained configurations for each model, and the corresponding performances in terms of ROC-AUC, accuracy, F1-score, MCC, and Cohen&#x2019;s Kappa coefficient were reported in <xref ref-type="sec" rid="s11">Supplementary Table S3</xref>. Learning rates, dropout, and the number of learned features appeared to be the most influential parameters on model performances. In fact, the optimal values for the GCN model were a learning rate of 0.001 and a dropout of 0.1. For the DAG model, the optimal learning rate was 0.0005, and the number of learned features per atom in the graph was equal to 30. The optimal batch size and number of epochs for both models were 64 and 40, respectively (<xref ref-type="sec" rid="s11">Supplementary Table&#x20;S3</xref>).</p>
<p>Radar plots representing all computed scores for each model on the train and test sets were generated (<xref ref-type="fig" rid="F3">Figure&#x20;3</xref>). None of the algorithms presented an overfitting trend. They all exhibited round-shaped radar plots indicating no differential performance based on the different scoring metrics. Overall, the RF algorithm slightly outperformed both DL algorithms. All three models presented MCC values higher than 0.5, indicating their ability to provide a satisfying class prediction for anticoronavirus molecules (<xref ref-type="sec" rid="s11">Supplementary Table S3</xref>). The RF and DAG models exhibited Cohen&#x2019;s Kappa coefficient higher than 0.6, which indicates the substantial power of these algorithms in distinguishing both classes. The GCN model presented a coefficient value equal to 0.56, indicating a moderate interrater&#x20;agreement.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Performances of the optimized models. <bold>(A)</bold> Radar plots of the models&#x2019; performances assessed on the train set <bold>(left)</bold> and the test set <bold>(right)</bold> through ROC-AUC, F1-score, Accuracy, Cohen&#x2019;s Kappa, MCC, and Recall. <bold>(B)</bold> The ROC curve of all three models. <bold>(C)</bold> The Precision-Recall (PR) curve of all three models.</p>
</caption>
<graphic xlink:href="fgene-12-744170-g003.tif"/>
</fig>
<p>The Receiver Operating Characteristic (ROC) curves exhibited smooth exponential-like shapes for all models, indicating satisfying classification power. The Precision-Recall (PR) curves also presented fair shapes for a balanced dataset (<xref ref-type="fig" rid="F3">Figure&#x20;3</xref>). At last, we performed a tenfold cross-validation. The average values of ROC-AUC, the F1-score, and the Recall over ten iterations were reported with the standard deviation values in <xref ref-type="table" rid="T2">Table&#x20;2</xref>. RF kept exhibiting the highest scores, although values were comparable across the three models. GCN achieved a higher ROC-AUC score as compared to the DAG model, an equivalent F1-score, and a lower Recall. Our results so far indicated that DL models kept achieving scores slightly lower than those of RF, despite being comparable.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Tenfold cross-validation results for the best classifiers. Scores are presented as mean values &#xb1; SD based on 10 iterations.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">ROC-AUC</th>
<th align="center">F1-score</th>
<th align="center">Recall</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">RF</td>
<td align="char" char="&#xb1;">0.85&#x20;&#xb1; 0.026</td>
<td align="char" char="&#xb1;">0.78&#x20;&#xb1; 0.027</td>
<td align="char" char="&#xb1;">0.76&#x20;&#xb1; 0.032</td>
</tr>
<tr>
<td align="left">DAG model</td>
<td align="char" char="&#xb1;">0.79&#x20;&#xb1; 0.013</td>
<td align="char" char="&#xb1;">0.73&#x20;&#xb1; 0.052</td>
<td align="char" char="&#xb1;">0.74&#x20;&#xb1; 0.103</td>
</tr>
<tr>
<td align="left">GCN model</td>
<td align="char" char="&#xb1;">0.83&#x20;&#xb1; 0.026</td>
<td align="char" char="&#xb1;">0.73&#x20;&#xb1; 0.037</td>
<td align="char" char="&#xb1;">0.70&#x20;&#xb1; 0.082</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-4">
<title>3.4 GCN Model Demonstrated Noticeable Generalization Power</title>
<p>The last validation step was performed on the three optimized algorithms in order to assess their predictive power in identifying lead compounds against coronaviruses in general and SARS-CoV-2 in particular. Considering the heterogeneity of our dataset in terms of experiments and targets, it is important to assess the ability of the AI algorithms to generalize when tested on unseen datasets. To this end, we split our dataset into category-based subsets. Only categories 3CLpro_cov and PLpro_cov presented sufficient data points (<xref ref-type="sec" rid="s11">Supplementary Table S1</xref>) to be used for a stratified validation of the algorithms&#x2019; performances.</p>
<p>Homogeneous training denotes all experiments where models were trained and tested on one category subset. Heterogeneous training denotes all experiments where models were trained on the mixed dataset and tested on one category subset. Finally, we called mixed training the experiments where models were trained and tested on the dataset consisting of a mix of categories. Performances in terms of accuracy, F1-score, Recall/sensitivity, and specificity were reported in <xref ref-type="sec" rid="s11">Supplementary Table&#x20;S4</xref>.</p>
<p>Algorithms&#x2019; performances on the 3CLpro_cov category presented comparable values with the mixed training results. On the other hand, low Recall values were obtained with the PLpro_cov category trained on homogeneous and heterogeneous data (<xref ref-type="sec" rid="s11">Supplementary Figure S1</xref>). It is noteworthy to report that the 3CLpro_cov subset constitutes 41.6% of the mixed dataset and presents equivalent proportions between the &#x201c;active&#x201d; and &#x201c;inactive&#x201d; classes. This was not the case for the PLpro_cov subset, which constitutes 35.8% of the mixed dataset but presented nonequilibrated class distribution (71.0% of inactive molecules). This can explain the low Recall scores obtained for this particular category (<xref ref-type="sec" rid="s11">Supplementary Figure&#x20;S1</xref>).</p>
<p>Noticeably, RF and GCN models could achieve comparable Recall scores through the homogeneous and heterogeneous training experiments. This means that these algorithms exhibited a similar ability to correctly predict active molecules if trained either on the mixed dataset or on the subset of the 3CLpro_cov category and then tested on the 3CLpro_cov test set. In addition, the GCN scores were maintained close to those obtained on the mixed dataset and in comparison with cross-validation results (<xref ref-type="fig" rid="F4">Figure&#x20;4</xref>). This revealed a generalization power of this particular DL algorithm superior to the other models.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>ROC-AUC scores of the best classifiers tested on stratified subsets of the data (homogeneous, heterogeneous, and mixed).</p>
</caption>
<graphic xlink:href="fgene-12-744170-g004.tif"/>
</fig>
<p>In order to confirm such findings, we performed an external validation of the three algorithms&#x2019; ability to predict potential inhibitors targeting SARS-CoV-2 out of the FDA-approved drugs collection. We used a PubChem bioassay that consisted in a primary screen of 1,518&#x20;FDA-approved molecules against SARS-CoV-2-infected cells, out of which 17 molecules were retained as potentially active. Out of our mixed dataset, we removed all molecules included within this external validation set. We retrained all three models on our mixed dataset using its full content. Then, we predicted for all FDA-approved molecules from the validation set their activity class. We assessed the classification outcome in comparison with the experimental data and calculated the confusion matrix elements (TP, TN, FP, and FN) for each model under two scenarios (<xref ref-type="table" rid="T3">Table&#x20;3</xref>). First, we calculated the confusion matrix elements while comparing the predicted activity class without regard to the classification confidence (<xref ref-type="sec" rid="s11">Supplementary Table S5</xref>). Then, we applied a threshold of 80% confidence to select the molecules that would be prioritized by each algorithm. Examining this set of prioritized molecules shall assess the usefulness of our classifiers in providing a successful subselection of molecules for experimental validation.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>External validation of the three models&#x2019; performances in comparison with experimental results from the PubChem bioassay AID_1409594. Columns 2&#x2013;5 report TP, TN, FP, and FN counts based on the overall predictions of the algorithms. Columns 6&#x2013;9 report the TP, TN, FP, and FN counts based on the subselection of molecules with prediction confidence higher than&#x20;80%.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Activity criterion</th>
<th colspan="4" align="center">All molecules: no confidence threshold</th>
<th colspan="4" align="center">Subselection of molecules above the 80% confidence threshold</th>
</tr>
<tr>
<th align="center">TP</th>
<th align="center">TN</th>
<th align="center">FP</th>
<th align="center">FN</th>
<th align="center">TP</th>
<th align="center">TN</th>
<th align="center">FP</th>
<th align="center">FN</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">RF</td>
<td align="center">4</td>
<td align="center">490</td>
<td align="center">119</td>
<td align="center">13</td>
<td align="center">1</td>
<td align="center">425</td>
<td align="center">12</td>
<td align="center">8</td>
</tr>
<tr>
<td align="left">DAG</td>
<td align="center">7</td>
<td align="center">719</td>
<td align="center">340</td>
<td align="center">10</td>
<td align="center">3</td>
<td align="center">359</td>
<td align="center">99</td>
<td align="center">3</td>
</tr>
<tr>
<td align="left">GCN</td>
<td align="center">8</td>
<td align="center">877</td>
<td align="center">182</td>
<td align="center">9</td>
<td align="center">5</td>
<td align="center">835</td>
<td align="center">147</td>
<td align="center">9</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>For each algorithm, we first observed the TP and FN counts out of the 17 active molecules. Overall, the GCN model achieved the highest TP count of 8/17 and the lowest FN count of 9/17. The next best performer was the DAG model with TP counts of 7/17, while RF demonstrated the lowest TP count of 4/17 (<xref ref-type="table" rid="T3">Table&#x20;3</xref>). Interestingly, when considering the prioritized list of molecules using the 80% selection threshold, the GCN model achieved the best performances with most of the TP being within the priority list (5 out of 8). The same trend was observed for the TN count with 835/877 being correctly classified as inactive with confidence higher than 80%. Less satisfying rates were achieved by the DAG model (3/7 of TP and 3/10 of FN within the 80% confidence threshold selection) and RF (1/4 of TP and 8/13 of FN within the 80% confidence threshold selection). Thus, the GCN model demonstrated a higher ability to correctly classify both active and inactive molecules within the FDA-approved drugs collection.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>AI, precisely ML and DL, have now demonstrated high potential of delivering successful research outcomes in the field of DD (<xref ref-type="bibr" rid="B1">Achdout et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B19">Gupta et&#x20;al., 2021</xref>). The application of ML algorithms to cheminformatics and DD is heavily dependent on the rise of molecular encoding systems. The early descriptors consisted in a series of physicochemical properties of the molecules that rapidly demonstrated their limitations. Thus, chemical structure encoding appeared as a promising avenue with the underlying hypothesis that the activity of a molecule is heavily correlated with its chemical structure (<xref ref-type="bibr" rid="B18">Gfeller et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B50">Shi et&#x20;al., 2015</xref>; <xref ref-type="bibr" rid="B44">Perualila-Tan et&#x20;al., 2016</xref>). Multiple approaches dedicated to calculating molecular fingerprints were then proposed (<xref ref-type="bibr" rid="B4">Bero et&#x20;al., 2017</xref>). These consist in capturing topological and connectivity information within the molecule structure for an enhanced description as compared to simple physicochemical descriptors. Other groups proposed graph convolution-based algorithms that consider the molecule structure as an undirected graph where atoms are nodes and bonds are edges. These methods were readily useful to implement DL architectures toward DD (<xref ref-type="bibr" rid="B69">Zitnik et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B34">Li et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B68">Zhang et&#x20;al., 2019</xref>). Conventional ML methods such as RF, SVM, and simple NN demonstrated their ability to predict the inhibitory activity of molecules (<xref ref-type="bibr" rid="B20">Heikamp and Bajorath, 2014</xref>; <xref ref-type="bibr" rid="B6">Cano et&#x20;al., 2017</xref>) in the particular case where datasets are limited to a few hundred molecules. 
On the other hand, DL algorithms achieved interesting results on larger datasets (<xref ref-type="bibr" rid="B55">Unterthiner et&#x20;al., 2014</xref>; <xref ref-type="bibr" rid="B2">Aliper et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B33">Lenselink et&#x20;al., 2017</xref>). This reflects the consistent dependency of DL algorithm performances on data size, although they are noticeably gaining ground, exhibiting as high performances as classic ML algorithms (<xref ref-type="bibr" rid="B19">Gupta et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B59">Walters and Barzilay, 2021</xref>). As DD is a low-data domain, adapted DL approaches were proposed such as one-shot (<xref ref-type="bibr" rid="B3">Altae-Tran et&#x20;al., 2017</xref>) and few-shot (<xref ref-type="bibr" rid="B35">Liu et&#x20;al., 2021</xref>) learning methods based on structure-activity relationships for activity predictions. Compared to more classical approaches, they demonstrated higher predictive power using a small number of positives in their training sets. However, they showed poor capability of generalization to distinct datasets.</p>
<p>In the present work, we assessed the performances of seven ML algorithms and four DL algorithms in predicting the activity of molecules against the COVID-19 viral agent. The training data is a unique collection of 2,610 data points integrated from different sources. It includes molecules presenting inhibiting actions against SARS-CoV and SARS-CoV-2 through multiple and heterogeneous experiments. Our results demonstrated the usefulness of such a dataset in building ML algorithms for activity prediction tasks toward DD against COVID-19. Best performing algorithms, namely, GCN and RF, demonstrated stable performances across different training/testing simulations on stratified subsets of the data. Through external validation on unseen data, the GCN model demonstrated the highest predictive power overall. The MoleculeNet authors performed an extensive benchmarking of multiple ML/DL algorithms, including those studied herein on different tasks and datasets (<xref ref-type="bibr" rid="B61">Wu et&#x20;al., 2018</xref>). RF and the GCN model were tested on multiple datasets (biophysics, physical chemistry, physiology, and quantum mechanics) and were often identified as the best performing algorithms within the conventional methods and the graph-based methods, respectively (<xref ref-type="bibr" rid="B61">Wu et&#x20;al., 2018</xref>). This is in line with our findings, although no direct comparison is possible due to the difference in the datasets used and the tasks on which performances were evaluated.</p>
<p>Data have always been a determinant factor in delivering robust ML. In the field of DD, it is a constant challenge to overcome. Many groups made considerable efforts in constituting dedicated datasets for DD (<xref ref-type="bibr" rid="B17">Gaulton et&#x20;al., 2012</xref>; <xref ref-type="bibr" rid="B63">Yang et&#x20;al., 2021</xref>). The interest in merging data from multiple sources (projects, experiments, etc.) was explored by other groups (<xref ref-type="bibr" rid="B12">Duran-Frigola et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B65">Zeng et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B22">Irwin et&#x20;al., 2020</xref>). Irwin et&#x20;al. demonstrated the ability of the Alchemite, a state-of-the-art DL algorithm, to outperform the RF-based QSAR model in property prediction (<xref ref-type="bibr" rid="B22">Irwin et&#x20;al., 2020</xref>). A recent work described a database called D3Similarity that contains 603 molecules with a validated activity against coronaviruses or human receptors (<xref ref-type="bibr" rid="B63">Yang et&#x20;al., 2021</xref>). The database has a web interface that allows for the screening of novel ligands to predict their potential to affect one of the main targets of SARS-CoV-2, namely, the 3CLpro and the PLpro. The activity prediction is performed through a direct assessment of the 2D or 3D similarity of a target molecule to the database elements. In this context, we have deployed important efforts in collecting and curating a dataset that can serve in training and validating different ML and DL approaches in tackling the search for therapeutics against SARS-CoV-2. Our dataset is larger than the D3Similarity dataset and yet ready for use in ML/DL applications against SARS-CoV-2. Conversely, it does not account for quantitative activity information.</p>
<p>It seems important to engage further efforts to integrate more information in our dataset toward its use for a quantitative prediction of molecules activity. Moreover, a deeper analysis of the dataset content may reveal important knowledge for DD projects. Further tuning of the dataset will aim to integrate valuable knowledge on what to expect from effective anti-SARS-CoV-2 molecules (<xref ref-type="bibr" rid="B54">Tummino et&#x20;al., 2021</xref>). In fact, it has been demonstrated that the cationic amphiphilic nature of some drugs may induce phospholipidosis rather than actual antiviral effects (<xref ref-type="bibr" rid="B54">Tummino et&#x20;al., 2021</xref>). Such properties should be further examined to enhance the relevance of our dataset to the development of COVID-19 therapeutics.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>In the present study, we collected and curated a dedicated dataset of 2,610 molecules having anticoronavirus effects. This valuable resource was formatted and used to perform different simulations and optimization of eleven ML and DL algorithms toward the classification of molecules into active and inactive classes. We were able to obtain three highly accurate classifiers that were validated through cross-validation and on an external set of data. The DL algorithms demonstrated the best performances.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>The datasets presented in this study along with the jupyter notebooks can be found online on <ext-link ext-link-type="uri" xlink:href="https://github.com/Harigua/ML_DD-applications/tree/main/COVID-19">https://github.com/Harigua/ML_DD-applications/tree/main/COVID-19</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>EH-S conceived the research. EH-S and IA-T designed the experiments. EH-S, OS, and YA collected and curated the data. EH-S and MH implemented the code, tested the performances, and generated the figures. EH-S analyzed the results and drafted the original manuscript. EH-S, MH, IA-T, OS, and IG reviewed and edited the manuscript. All authors read and approved the final manuscript.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>EH-S is a recipient of a NAS grant within the USAID PEER Women Mentoring Programme, Grant Award Number AID-OAA-A-11-00012. EH-S is also a recipient of the &#x201c;Cov2-Anti-Proteases&#x201d; project funded by Institut Pasteur-Paris, &#x201c;Lev&#xe9;e de Fond-Urgence COVID-19.&#x201d; EH-S and IG are financially supported by the programs of the Ministry of Higher education and Research of the Republic of Tunisia.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article or claim that may be made by its manufacturer is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2021.744170/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2021.744170/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>Supplementary Figure&#x20;S1</label>
<caption>
<p>Performances of the three algorithms GCN, DAG and RF assessed on homogeneous vs. heterogeneous data. (a) Performances for the subsets of the category 3CLpro. (b) Performances for the subset of the category PLpro. (c) Comparison between the models&#x2019; performances on the mixed dataset and the 3CLpro subset.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table&#x20;S1</label>
<caption>
<p>Statistics on the anticoronavirus dataset according to their origin (bioassays vs. literature) and their type (experiments, targets, etc). The composition of the training, validation and test sets in terms of active and inactive molecules is indicated.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table&#x20;S2</label>
<caption>
<p>Performances of all 11 algorithms using different splitting ratios (80/10/10 vs. 60/20/20) and methods (random vs. scaffold split) on different datasets (heterogeneous vs. homogeneous).</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table&#x20;S3</label>
<caption>
<p>Hyperparameters&#x2019; tuning and optimization for the three best performers: GCN, DAG and RF.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table&#x20;S4</label>
<caption>
<p>Performances of the optimized algorithms GCN, DAG and RF in terms of accuracy, F1-score, sensitivity and specificity, on different subsets of the dataset.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table&#x20;S5</label>
<caption>
<p>Predictions assessment of the three algorithms GCN, DAG and RF on the subset of experimentally validated molecules (sheet 1). Sheets 2-4 contain the prediction outcomes for each algorithm for all molecules.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table2.XLSX" id="SM1" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table3.XLSX" id="SM2" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table4.XLSX" id="SM3" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table1.XLSX" id="SM4" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image1.PNG" id="SM5" mimetype="application/PNG" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Table5.XLSX" id="SM6" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Achdout</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Aimon</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bar-David</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Barr</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ben-Shmuel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bennett</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <source>Covid Moonshot: Open Science Discovery of Sars-Cov-2 Main Protease Inhibitors by Combining Crowdsourcing, High-Throughput Experiments, Computational Simulations, and Machine Learning</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>BioRxiv</publisher-name>. </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aliper</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Plis</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Artemov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ulloa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mamoshina</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zhavoronkov</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Deep Learning Applications for Predicting Pharmacological Properties of Drugs and Drug Repurposing Using Transcriptomic Data</article-title>. <source>Mol. Pharmaceutics</source> <volume>13</volume>, <fpage>2524</fpage>&#x2013;<lpage>2530</lpage>. <pub-id pub-id-type="doi">10.1021/acs.molpharmaceut.6b00248</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Altae-Tran</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ramsundar</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Pappu</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Pande</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Low Data Drug Discovery with One-Shot Learning</article-title>. <source>ACS Cent. Sci.</source> <volume>3</volume>, <fpage>283</fpage>&#x2013;<lpage>293</lpage>. <pub-id pub-id-type="doi">10.1021/acscentsci.6b00367</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bero</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Muda</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Choo</surname>
<given-names>Y. H.</given-names>
</name>
<name>
<surname>Muda</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Pratama</surname>
<given-names>S. F.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Similarity Measure for Molecular Structure: a Brief Review</article-title>. <source>J.&#x20;Phys. Conf. Ser.</source> <volume>892</volume>, <fpage>012015</fpage>. <pub-id pub-id-type="doi">10.1088/1742-6596/892/1/012015</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bung</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Krishnan</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Bulusu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>De Novo design of New Chemical Entities for Sars-Cov-2 Using Artificial Intelligence</article-title>. <source>Future Med. Chem.</source> <volume>13</volume>, <fpage>575</fpage>&#x2013;<lpage>585</lpage>. <pub-id pub-id-type="doi">10.4155/fmc-2020-0262</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cano</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Garcia-Rodriguez</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Garcia-Garcia</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Perez-Sanchez</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Benediktsson</surname>
<given-names>J.&#x20;A.</given-names>
</name>
<name>
<surname>Thapa</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Automatic Selection of Molecular Descriptors Using Random forest: Application to Drug Discovery</article-title>. <source>Expert Syst. Appl.</source> <volume>72</volume>, <fpage>151</fpage>&#x2013;<lpage>159</lpage>. <pub-id pub-id-type="doi">10.1016/j.eswa.2016.12.008</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chellapandi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Saranya</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Genomics Insights of Sars-Cov-2 (Covid-19) into Target-Based Drug Discovery</article-title>. <source>Med. Chem. Res.</source> <volume>31</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1007/s00044-020-02610-8</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chung</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Miasojedow</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Startek</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gambin</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Jaccard/tanimoto Similarity Test and Estimation Methods for Biological Presence-Absence Data</article-title>. <source>BMC bioinformatics</source> <volume>20</volume>, <fpage>644</fpage>&#x2013;<lpage>711</lpage>. <pub-id pub-id-type="doi">10.1186/s12859-019-3118-5</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>David</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Thakkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mercado</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Engkvist</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Molecular Representations in Ai-Driven Drug Discovery: a Review and Practical Guide</article-title>. <source>J.&#x20;Cheminform</source>. <volume>12</volume>, <fpage>56</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1186/s13321-020-00460-5</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dietterich</surname>
<given-names>T. G.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>Ensemble Methods in Machine Learning</article-title>,&#x201d; in <conf-name>International Workshop on Multiple Classifier Systems</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1007/3-540-45014-9_1</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dragojevic Simic</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Miljkovic</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Stamenkovic</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Vekic</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ratkovic</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Simic</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>An Overview of Antiviral Strategies for Coronavirus 2 (Sars-cov-2) Infection with Special Reference to Antimalarial Drugs Chloroquine and Hydroxychloroquine</article-title>. <source>Int. J.&#x20;Clin. Pract.</source> <volume>75</volume>, <fpage>e13825</fpage>. <pub-id pub-id-type="doi">10.1111/ijcp.13825</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duran-Frigola</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fern&#xe1;ndez-Torras</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bertoni</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Aloy</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Formatting Biological Big Data for Modern Machine Learning in Drug Discovery</article-title>. <source>Wiley Interdiscip. Rev. Comput. Mol. Sci.</source> <volume>9</volume>, <fpage>e1408</fpage>. <pub-id pub-id-type="doi">10.1002/wcms.1408</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Duvenaud</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Maclaurin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Aguilera-Iparraguirre</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>G&#xf3;mez-Bombarelli</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hirzel</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Aspuru-Guzik</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <source>Convolutional Networks on Graphs for Learning Molecular Fingerprints</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>arXiv preprint arXiv:1509.09292</publisher-name>. </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ericksen</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Michael</surname>
<given-names>L. A.</given-names>
</name>
<name>
<surname>Newton</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Hoffmann</surname>
<given-names>F. M.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Machine Learning Consensus Scoring Improves Performance across Targets in Structure-Based Virtual Screening</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>57</volume>, <fpage>1579</fpage>&#x2013;<lpage>1590</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.7b00153</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Filippov</surname>
<given-names>I. V.</given-names>
</name>
<name>
<surname>Nicklaus</surname>
<given-names>M. C.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Optical Structure Recognition Software to Recover Chemical Information: OSRA, an Open Source Solution</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>49</volume>, <fpage>740</fpage>&#x2013;<lpage>743</lpage>. <pub-id pub-id-type="doi">10.1021/ci800067r</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Galan</surname>
<given-names>L. E. B.</given-names>
</name>
<name>
<surname>Santos</surname>
<given-names>N. M. d.</given-names>
</name>
<name>
<surname>Asato</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Ara&#xfa;jo</surname>
<given-names>J.&#x20;V.</given-names>
</name>
<name>
<surname>de Lima Moreira</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ara&#xfa;jo</surname>
<given-names>A. M. M.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Phase 2 Randomized Study on Chloroquine, Hydroxychloroquine or Ivermectin in Hospitalized Patients with Severe Manifestations of Sars-Cov-2 Infection</article-title>. <source>Pathog. Glob. Health</source> <volume>115</volume>, <fpage>235</fpage>&#x2013;<lpage>242</lpage>. <pub-id pub-id-type="doi">10.1080/20477724.2021.1890887</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gaulton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bellis</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Bento</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Chambers</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Davies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hersey</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>Chembl: a Large-Scale Bioactivity Database for Drug Discovery</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume>, <fpage>D1100</fpage>&#x2013;<lpage>D1107</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkr777</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gfeller</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Grosdidier</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wirth</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Daina</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michielin</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Zoete</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>SwissTargetPrediction: a Web Server for Target Prediction of Bioactive Small Molecules</article-title>. <source>Nucleic Acids Res.</source> <volume>42</volume>, <fpage>W32</fpage>&#x2013;<lpage>W38</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gku293</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Srivastava</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sahu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tiwari</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ambasta</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Artificial Intelligence to Deep Learning: Machine Intelligence Approach for Drug Discovery</article-title>. <source>Mol. Divers.</source> <volume>25</volume>, <fpage>1</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1007/s11030-021-10217-3</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heikamp</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bajorath</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Support Vector Machines for Drug Discovery</article-title>. <source>Expert Opin. Drug Discov.</source> <volume>9</volume>, <fpage>93</fpage>&#x2013;<lpage>104</lpage>. <pub-id pub-id-type="doi">10.1517/17460441.2014.866943</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hoffmann</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>M&#xf6;sbauer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hofmann-Winkler</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kaul</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kleine-Weber</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kr&#xfc;ger</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Chloroquine Does Not Inhibit Infection of Human Lung Cells with SARS-CoV-2</article-title>. <source>Nature</source> <volume>585</volume>, <fpage>588</fpage>&#x2013;<lpage>590</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-2575-3</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Irwin</surname>
<given-names>B. W. J.</given-names>
</name>
<name>
<surname>Levell</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Whitehead</surname>
<given-names>T. M.</given-names>
</name>
<name>
<surname>Segall</surname>
<given-names>M. D.</given-names>
</name>
<name>
<surname>Conduit</surname>
<given-names>G. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Practical Applications of Deep Learning to Impute Heterogeneous Drug Discovery Data</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>60</volume>, <fpage>2848</fpage>&#x2013;<lpage>2857</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.0c00443</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jing</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>X. Q.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep Learning for Drug Design: an Artificial Intelligence Paradigm for Drug Discovery in the Big Data Era</article-title>. <source>AAPS J.</source> <volume>20</volume>, <fpage>58</fpage>. <pub-id pub-id-type="doi">10.1208/s12248-018-0210-0</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kearnes</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>McCloskey</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Berndl</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pande</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Riley</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Molecular Graph Convolutions: Moving beyond Fingerprints</article-title>. <source>J.&#x20;Comput. Aided Mol. Des.</source> <volume>30</volume>, <fpage>595</fpage>&#x2013;<lpage>608</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-016-9938-8</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kelleni</surname>
<given-names>M. T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Tocilizumab, Remdesivir, Favipiravir, and Dexamethasone Repurposed for COVID-19: A Comprehensive Clinical and Pharmacovigilant Reassessment</article-title>. <source>SN Compr. Clin. Med.</source> <volume>3</volume>, <fpage>919</fpage>&#x2013;<lpage>923</lpage>. <pub-id pub-id-type="doi">10.1007/s42399-021-00824-4</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Keshavarzi Arshadi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Webb</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Salem</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cruz</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Calad-Thomson</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ghadirian</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Artificial Intelligence for COVID-19 Drug Discovery and Vaccine Development</article-title>. <source>Front. Artif. Intell.</source> <volume>3</volume>, <fpage>65</fpage>. <pub-id pub-id-type="doi">10.3389/frai.2020.00065</pub-id> </citation>
</ref>
<ref id="B70">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gindulyte</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>PubChem in 2021: New Data Content and Improved Web Interfaces</article-title>. <source>Nucleic Acids Res.</source> <volume>49</volume>, <fpage>D1388</fpage>&#x2013;<lpage>D1395</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaa971</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kipf</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Welling</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Semi-supervised Classification with Graph Convolutional Networks</source>. <publisher-loc>New York</publisher-loc>: <publisher-name>arXiv preprint arXiv:1609.02907</publisher-name>. </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Korkmaz</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep Learning-Based Imbalanced Data Classification for Drug Discovery</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>60</volume>, <fpage>4180</fpage>&#x2013;<lpage>4190</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.9b01162</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lavecchia</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Deep Learning in Drug Discovery: Opportunities, Challenges and Future Prospects</article-title>. <source>Drug Discov. Today</source> <volume>24</volume>, <fpage>2017</fpage>&#x2013;<lpage>2032</lpage>. <pub-id pub-id-type="doi">10.1016/j.drudis.2019.07.006</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Huynh</surname>
<given-names>T.-T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Identifying SNAREs by Incorporating Deep Learning Architecture and Amino Acid Embedding Representation</article-title>. <source>Front. Physiol.</source> <volume>10</volume>, <fpage>1501</fpage>. <pub-id pub-id-type="doi">10.3389/fphys.2019.01501</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>V.-N.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>SNARE-CNN: a 2D Convolutional Neural Network Architecture to Identify SNARE Proteins from High-Throughput Sequencing Data</article-title>. <source>PeerJ&#x20;Comp. Sci.</source> <volume>5</volume>, <fpage>e177</fpage>. <pub-id pub-id-type="doi">10.7717/peerj-cs.177</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Yapp</surname>
<given-names>E. K. Y.</given-names>
</name>
<name>
<surname>Nagasundaram</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Yeh</surname>
<given-names>H.-Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Classifying Promoters by Interpreting the Hidden Information of DNA Sequences <italic>via</italic> Deep Learning and Combination of Continuous FastText N-Grams</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>7</volume>, <fpage>305</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2019.00305</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lenselink</surname>
<given-names>E. B.</given-names>
</name>
<name>
<surname>Ten Dijke</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Bongers</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Papadatos</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Van Vlijmen</surname>
<given-names>H. W. T.</given-names>
</name>
<name>
<surname>Kowalczyk</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Beyond the Hype: Deep Neural Networks Outperform Established Methods Using a ChEMBL Bioactivity Benchmark Set</article-title>. <source>J.&#x20;Cheminform.</source> <volume>9</volume>, <fpage>45</fpage>. <pub-id pub-id-type="doi">10.1186/s13321-017-0232-0</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>DeepChemStable: Chemical Stability Prediction with an Attention-Based Graph Convolution Network</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>59</volume>, <fpage>1044</fpage>&#x2013;<lpage>1049</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.8b00672</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>COVID-19 Multi-Targeted Drug Repurposing Using Few-Shot Learning</article-title>. <source>Front. Bioinformatics</source> <volume>1</volume>, <fpage>18</fpage>. <pub-id pub-id-type="doi">10.3389/fbinf.2021.693177</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lo</surname>
<given-names>Y.-C.</given-names>
</name>
<name>
<surname>Rensi</surname>
<given-names>S. E.</given-names>
</name>
<name>
<surname>Torng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Altman</surname>
<given-names>R. B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Machine Learning in Chemoinformatics and Drug Discovery</article-title>. <source>Drug Discov. Today</source> <volume>23</volume>, <fpage>1538</fpage>&#x2013;<lpage>1546</lpage>. <pub-id pub-id-type="doi">10.1016/j.drudis.2018.05.010</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lusci</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pollastri</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Baldi</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Deep Architectures and Deep Learning in Chemoinformatics: the Prediction of Aqueous Solubility for Drug-like Molecules</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>53</volume>, <fpage>1563</fpage>&#x2013;<lpage>1575</lpage>. <pub-id pub-id-type="doi">10.1021/ci400187y</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Matthews</surname>
<given-names>B. W.</given-names>
</name>
</person-group> (<year>1975</year>). <article-title>Comparison of the Predicted and Observed Secondary Structure of T4 Phage Lysozyme</article-title>. <source>Biochim. Biophys. Acta (BBA) - Protein Struct.</source> <volume>405</volume>, <fpage>442</fpage>&#x2013;<lpage>451</lpage>. <pub-id pub-id-type="doi">10.1016/0005-2795(75)90109-9</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Micheli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Neural Network for Graphs: A Contextual Constructive Approach</article-title>. <source>IEEE Trans. Neural Netw.</source> <volume>20</volume>, <fpage>498</fpage>&#x2013;<lpage>511</lpage>. <pub-id pub-id-type="doi">10.1109/tnn.2008.2010350</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moiseev</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Avdeev</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Brovko</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Novikov</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Fomin</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Is There a Future for Hydroxychloroquine/Chloroquine in Prevention of SARS-CoV-2 Infection (COVID-19)?</article-title> <source>Ann. Rheum. Dis.</source> <volume>80</volume>, <fpage>e19</fpage>. <pub-id pub-id-type="doi">10.1136/annrheumdis-2020-217570</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Pastick</surname>
<given-names>K. A.</given-names>
</name>
<name>
<surname>Okafor</surname>
<given-names>E. C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lofgren</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Skipper</surname>
<given-names>C. P.</given-names>
</name>
<name>
<surname>Nicol</surname>
<given-names>M. R.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Hydroxychloroquine and Chloroquine for Treatment of SARS-CoV-2 (COVID-19)</article-title>,&#x201d; in <source>Open Forum Infectious Diseases</source> (<publisher-name>Oxford University Press US</publisher-name>), <fpage>ofaa130</fpage>. </citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paul</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sanap</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shenoy</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kalyane</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kalia</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Tekade</surname>
<given-names>R. K.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Artificial Intelligence in Drug Discovery and Development</source>. <publisher-loc>Amsterdam</publisher-loc>: <publisher-name>Drug Discovery Today</publisher-name>. </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>J.&#x20;Machine Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Perualila-Tan</surname>
<given-names>N. J.</given-names>
</name>
<name>
<surname>Shkedy</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Talloen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>G&#xf6;hlmann</surname>
<given-names>H. W. H.</given-names>
</name>
<name>
<surname>Moerbeke</surname>
<given-names>M. V.</given-names>
</name>
<name>
<surname>Kasim</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Weighted Similarity-Based Clustering of Chemical Structures and Bioactivity Data in Early Drug Discovery</article-title>. <source>J.&#x20;Bioinform. Comput. Biol.</source> <volume>14</volume>, <fpage>1650018</fpage>. <pub-id pub-id-type="doi">10.1142/s0219720016500189</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pillaiyar</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Meenakshisundaram</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Manickam</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Recent Discovery and Development of Inhibitors Targeting Coronaviruses</article-title>. <source>Drug Discov. Today</source> <volume>25</volume>, <fpage>668</fpage>&#x2013;<lpage>688</lpage>. <pub-id pub-id-type="doi">10.1016/j.drudis.2020.01.015</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ramsundar</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Eastman</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Walters</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pande</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Deep Learning for the Life Sciences: Applying Deep Learning to Genomics, Microscopy, Drug Discovery, and More</source>. <publisher-loc>Sebastopol, CA</publisher-loc>: <publisher-name>&#x201c;O&#x2019;Reilly Media, Inc.&#x201d;</publisher-name>. </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramsundar</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Verras</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tudor</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sheridan</surname>
<given-names>R. P.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Is Multitask Deep Learning Practical for Pharma?</article-title> <source>J.&#x20;Chem. Inf. Model.</source> <volume>57</volume>, <fpage>2068</fpage>&#x2013;<lpage>2076</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.7b00146</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rifaioglu</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Atas</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Cetin-Atalay</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Atalay</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Do&#x11f;an</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Recent Applications of Deep Learning and Machine Intelligence on In Silico Drug Discovery: Methods, Tools and Databases</article-title>. <source>Brief. Bioinformatics</source> <volume>20</volume>, <fpage>1878</fpage>&#x2013;<lpage>1912</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bby061</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rogers</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hahn</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Extended-connectivity Fingerprints</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>50</volume>, <fpage>742</fpage>&#x2013;<lpage>754</lpage>. <pub-id pub-id-type="doi">10.1021/ci100050t</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>J.-Y.</given-names>
</name>
<name>
<surname>Yiu</surname>
<given-names>S.-M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Leung</surname>
<given-names>H. C. M.</given-names>
</name>
<name>
<surname>Chin</surname>
<given-names>F. Y. L.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Predicting Drug-Target Interaction for New Drugs Using Enhanced Similarity Measures and Super-target Clustering</article-title>. <source>Methods</source> <volume>83</volume>, <fpage>98</fpage>&#x2013;<lpage>104</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2015.04.036</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>L. G.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Q. X.</given-names>
</name>
<name>
<surname>Lao</surname>
<given-names>H. L.</given-names>
</name>
<name>
<surname>Lv</surname>
<given-names>Z. Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Human Coronaviruses and Therapeutic Drug Discovery</article-title>. <source>Infect. Dis. Poverty</source> <volume>10</volume>, <fpage>28</fpage>. <pub-id pub-id-type="doi">10.1186/s40249-021-00812-9</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Multi-stage Self-Supervised Learning for Graph Convolutional Networks on Graphs with Few Labeled Nodes</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>34</volume>, <fpage>5892</fpage>&#x2013;<lpage>5899</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v34i04.6048</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trezza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Iovinelli</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Santucci</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Prischi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Spiga</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>An Integrated Drug Repurposing Strategy for the Rapid Identification of Potential SARS-CoV-2 Viral Inhibitors</article-title>. <source>Sci. Rep.</source> <volume>10</volume>, <fpage>13866</fpage>&#x2013;<lpage>13868</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-70863-9</pub-id> </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tummino</surname>
<given-names>T. A.</given-names>
</name>
<name>
<surname>Rezelj</surname>
<given-names>V. V.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>O&#x2019;Meara</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Monel</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Drug-induced Phospholipidosis Confounds Drug Repurposing for SARS-CoV-2</article-title>. <source>Science</source> <volume>373</volume>, <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1126/science.abi4708</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Unterthiner</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Mayr</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Klambauer</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Steijaert</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wegner</surname>
<given-names>J.&#x20;K.</given-names>
</name>
<name>
<surname>Ceulemans</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Deep Learning as an Opportunity in Virtual Screening</article-title>. <source>Proc. Deep Learn. Workshop NIPS</source> <volume>27</volume>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vamathevan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Czodrowski</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dunham</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Ferran</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Applications of Machine Learning in Drug Discovery and Development</article-title>. <source>Nat. Rev. Drug Discov.</source> <volume>18</volume>, <fpage>463</fpage>&#x2013;<lpage>477</lpage>. <pub-id pub-id-type="doi">10.1038/s41573-019-0024-5</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Velickovic</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cucurull</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Casanova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Romero</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lio</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Graph Attention Networks</article-title>. <source>Stat</source> <volume>1050</volume>, <fpage>4</fpage>. </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vincent</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Bergeron</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Benjannet</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Erickson</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Rollin</surname>
<given-names>P. E.</given-names>
</name>
<name>
<surname>Ksiazek</surname>
<given-names>T. G.</given-names>
</name>
<etal/>
</person-group> (<year>2005</year>). <article-title>Chloroquine Is a Potent Inhibitor of SARS Coronavirus Infection and Spread</article-title>. <source>Virol. J.</source> <volume>2</volume>, <fpage>69</fpage>. <pub-id pub-id-type="doi">10.1186/1743-422X-2-69</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Walters</surname>
<given-names>W. P.</given-names>
</name>
<name>
<surname>Barzilay</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Critical Assessment of AI in Drug Discovery</article-title>. <source>Expert Opin. Drug Discov.</source> <volume>16</volume>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1080/17460441.2021.1915982</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>P. S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A Comprehensive Survey on Graph Neural Networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>32</volume>, <fpage>4</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2020.2978386</pub-id> </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ramsundar</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Feinberg</surname>
<given-names>E. N.</given-names>
</name>
<name>
<surname>Gomes</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Geniesse</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Pappu</surname>
<given-names>A. S.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>MoleculeNet: a Benchmark for Molecular Machine Learning</article-title>. <source>Chem. Sci.</source> <volume>9</volume>, <fpage>513</fpage>&#x2013;<lpage>530</lpage>. <pub-id pub-id-type="doi">10.1039/c7sc02664a</pub-id> </citation>
</ref>
<ref id="B62">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting or Pretending: Artificial Intelligence for Protein-Ligand Interactions Lack of Sufficiently Large and Unbiased Datasets</article-title>. <source>Front. Pharmacol.</source> <volume>11</volume>, <fpage>69</fpage>. <pub-id pub-id-type="doi">10.3389/fphar.2020.00069</pub-id> </citation>
</ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Mu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Ligand-based Approach for Predicting Drug Targets and for Virtual Screening against COVID-19</article-title>. <source>Brief. Bioinform.</source> <volume>22</volume>, <fpage>1053</fpage>&#x2013;<lpage>1064</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa422</pub-id> </citation>
</ref>
<ref id="B64">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>
<italic>In Vitro</italic> Antiviral Activity and Projection of Optimized Dosing Design of Hydroxychloroquine for the Treatment of Severe Acute Respiratory Syndrome Coronavirus 2 (SARS-CoV-2)</article-title>. <source>Clin. Infect. Dis.</source> <volume>71</volume>, <fpage>732</fpage>&#x2013;<lpage>739</lpage>. <pub-id pub-id-type="doi">10.1093/cid/ciaa237</pub-id> </citation>
</ref>
<ref id="B65">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Nussinov</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>deepDR: a Network-Based Deep Learning Approach to In Silico Drug Repositioning</article-title>. <source>Bioinformatics</source> <volume>35</volume>, <fpage>5191</fpage>&#x2013;<lpage>5198</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz418</pub-id> </citation>
</ref>
<ref id="B66">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhai</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Haider</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kraut</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>An Integrated Computational and Experimental Approach to Identifying Inhibitors for SARS-CoV-2 3CL Protease</article-title>. <source>Front. Mol. Biosci.</source> <volume>8</volume>, <fpage>267</fpage>. <pub-id pub-id-type="doi">10.3389/fmolb.2021.661424</pub-id> </citation>
</ref>
<ref id="B67">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Saravanan</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hossain</surname>
<given-names>M. T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Deep Learning Based Drug Screening for Novel Coronavirus 2019-nCoV</article-title>. <source>Interdiscip. Sci. Comput. Life Sci.</source> <volume>12</volume>, <fpage>368</fpage>&#x2013;<lpage>376</lpage>. <pub-id pub-id-type="doi">10.1007/s12539-020-00376-6</pub-id> </citation>
</ref>
<ref id="B68">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Maciejewski</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Graph Convolutional Networks: a Comprehensive Review</article-title>. <source>Comput. Soc. Networks</source> <volume>6</volume>, <fpage>1</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1186/s40649-019-0069-y</pub-id> </citation>
</ref>
<ref id="B69">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zitnik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Agrawal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Modeling Polypharmacy Side Effects with Graph Convolutional Networks</article-title>. <source>Bioinformatics</source> <volume>34</volume>, <fpage>i457</fpage>&#x2013;<lpage>i466</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty294</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>