<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Chem.</journal-id>
<journal-title>Frontiers in Chemistry</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Chem.</abbrev-journal-title>
<issn pub-type="epub">2296-2646</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">866585</article-id>
<article-id pub-id-type="doi">10.3389/fchem.2022.866585</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Chemistry</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Improving Small Molecule pK<sub>
<italic>a</italic>
</sub> Prediction Using Transfer Learning With Graph Neural Networks</article-title>
<alt-title alt-title-type="left-running-head">Mayr et al.</alt-title>
<alt-title alt-title-type="right-running-head">pKa Prediction With GNNs</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Mayr</surname>
<given-names>Fritz</given-names>
</name>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<xref ref-type="fn" rid="fn2">
<sup>&#x2021;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1664572/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wieder</surname>
<given-names>Marcus</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<xref ref-type="fn" rid="fn2">
<sup>&#x2021;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/687941/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wieder</surname>
<given-names>Oliver</given-names>
</name>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1119729/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Langer</surname>
<given-names>Thierry</given-names>
</name>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
</contrib>
</contrib-group>
<aff>
<institution>Department of Pharmaceutical Sciences</institution>, <institution>Pharmaceutical Chemistry Division</institution>, <institution>University of Vienna</institution>, <addr-line>Vienna</addr-line>, <country>Austria</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/382615/overview">Marco Tutone</ext-link>, University of Palermo, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/759784/overview">Jean-Louis Reymond</ext-link>, University of Bern, Switzerland</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/775834/overview">Kun Yao</ext-link>, Schrodinger, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Marcus Wieder, <email>marcus.wieder@gmail.com</email>
</corresp>
<fn fn-type="other" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>ORCID: Fritz Mayr, <ext-link ext-link-type="uri" xlink:href="http://orcid.org/0000-0002-6621-2108">orcid.org/0000-0002-6621-2108</ext-link>; Marcus Wieder, <ext-link ext-link-type="uri" xlink:href="http://orcid.org/0000-0003-2631-8415">orcid.org/0000-0003-2631-8415</ext-link>; Oliver Wieder, <ext-link ext-link-type="uri" xlink:href="http://orcid.org/0000-0003-4967-7613">orcid.org/0000-0003-4967-7613</ext-link>; Thierry Langer, <ext-link ext-link-type="uri" xlink:href="http://orcid.org/0000-0002-5242-1240">orcid.org/0000-0002-5242-1240</ext-link>
</p>
</fn>
<fn fn-type="equal" id="fn2">
<label>
<sup>&#x2021;</sup>
</label>
<p>These authors have contributed equally to this work</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Theoretical and Computational Chemistry, a section of the journal Frontiers in Chemistry</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>26</day>
<month>05</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>10</volume>
<elocation-id>866585</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>01</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>04</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Mayr, Wieder, Wieder and Langer.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Mayr, Wieder, Wieder and Langer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Enumerating protonation states and calculating microstate pK<sub>
<italic>a</italic>
</sub> values of small molecules is an important yet challenging task for lead optimization and molecular modeling. Commercial and non-commercial solutions have notable limitations such as restrictive and expensive licenses, high CPU/GPU hour requirements, or the need for expert knowledge to set up and use. We present a graph neural network model that is trained on 714,906 calculated microstate pK<sub>
<italic>a</italic>
</sub> predictions from molecules obtained from the ChEMBL database. The model is fine-tuned on a set of 5,994 experimental pK<sub>
<italic>a</italic>
</sub> values significantly improving its performance on two challenging test sets. Combining the graph neural network model with Dimorphite-DL, an open-source program for enumerating ionization states, we have developed the open-source Python package pkasolver, which is able to generate and enumerate protonation states and calculate pK<sub>
<italic>a</italic>
</sub> values with high accuracy.</p>
</abstract>
<kwd-group>
<kwd>physical properties</kwd>
<kwd>PKA</kwd>
<kwd>Graph Neural Network (GNN)</kwd>
<kwd>transfer learning</kwd>
<kwd>protonation states</kwd>
</kwd-group>
<contract-num rid="cn001">J 4245-N28</contract-num>
<contract-num rid="cn002">IMI2 JU 821528</contract-num>
<contract-sponsor id="cn001">Austrian Science Fund<named-content content-type="fundref-id">10.13039/501100002428</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">Innovative Medicines Initiative<named-content content-type="fundref-id">10.13039/501100010767</named-content>
</contract-sponsor>
<contract-sponsor id="cn003">Horizon 2020 Framework Programme<named-content content-type="fundref-id">10.13039/100010661</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The acid dissociation constant (K<sub>
<italic>a</italic>
</sub>), most often written as its negative logarithm (pK<sub>
<italic>a</italic>
</sub>), plays a significant role in molecular modeling, as it influences the charge, tautomer configuration, and overall 3D structure of molecules with accessible protonation states in the physiological pH range. All these factors further shape the mobility, permeability, stability, and mode of action of substances in the body (<xref ref-type="bibr" rid="B22">Manallack et al., 2013</xref>). In case of insufficient or missing empirical data, the correct determination of pK<sub>
<italic>a</italic>
</sub> values is thus essential to correctly predict the aforementioned molecular properties.</p>
<p>Authors and studies disagree on the exact percentage of drugs with ionizable groups, but a conservative estimate suggests that at least two-thirds of all drugs contain one or more ionization groups (in a pH range of 2&#x2013;12) (<xref ref-type="bibr" rid="B23">Manallack, 2007</xref>). The importance of pK<sub>
<italic>a</italic>
</sub> predictions for drug discovery has been widely recognized and has been the topic of multiple blind predictive challenges&#x2014;most notably the Statistical Assessment of Modeling of Proteins and Ligands (SAMPL) series SAMPL6 (<xref ref-type="bibr" rid="B16">I&#x15f;&#x131;k et al., 2021</xref>), SAMPL7 (<xref ref-type="bibr" rid="B3">Bergazin et al., 2021</xref>), and ongoing SAMPL8 <xref ref-type="fn" rid="fn10">
<sup>1</sup>
</xref> challenge.</p>
<p>Multiple methods have been developed to estimate pK<sub>
<italic>a</italic>
</sub> values of small molecules, ranging from physical models based on quantum chemistry calculations (<xref ref-type="bibr" rid="B38">Selwa et al., 2018</xref>; <xref ref-type="bibr" rid="B40">Tielker et al., 2018</xref>) and/or free energy calculations (<xref ref-type="bibr" rid="B33">Prasad et al., 2018</xref>; <xref ref-type="bibr" rid="B45">Zeng et al., 2018</xref>) to empirical models based on linear free energy relationships using the Hammett-Taft equation or more data-driven methods using quantitative structure-property relationship (QSPR) and machine learning (ML) approaches like deep neural network or random forest models (<xref ref-type="bibr" rid="B19">Liao and Nicklaus, 2009a</xref>; <xref ref-type="bibr" rid="B37">Rupp et al., 2011</xref>; <xref ref-type="bibr" rid="B24">Mansouri et al., 2019</xref>; <xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>; <xref ref-type="bibr" rid="B3">Bergazin et al., 2021</xref>). In general, empirical methods require significantly less computational effort than their physics-based counterparts once they are parameterized but require a relatively large number of high-quality data points as training set (<xref ref-type="bibr" rid="B3">Bergazin et al., 2021</xref>).</p>
<p>In recent years, machine learning methods have been widely applied to predict different molecular properties including pK<sub>
<italic>a</italic>
</sub> predictions. Many of these approaches learn pK<sub>
<italic>a</italic>
</sub> values on fingerprint representations of molecules (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>; <xref ref-type="bibr" rid="B44">Yang et al., 2020</xref>). The pK<sub>
<italic>a</italic>
</sub> value of an acid and conjugate base pair is determined by the molecular structure and the molecular effects on the reaction center exerted by its neighborhood, including mesomeric, inductive, steric, and entropic effects (<xref ref-type="bibr" rid="B32">Perrin et al., 1981</xref>). Ideally, these effects should be included and encoded in a suitable fingerprint or set of descriptors. For many applications, extended-connectivity fingerprints (ECFPs) in combination with molecular features have proven to be a suitable and powerful tool to learn structure-property relationships (<xref ref-type="bibr" rid="B35">Rogers and Hahn, 2010</xref>; <xref ref-type="bibr" rid="B17">Jiang et al., 2021</xref>).</p>
<p>The emergence of graph neural networks (GNNs) has shifted some focus from descriptors and fingerprints designed by domain experts to these emerging deep learning methods. GNNs are a class of deep learning methods designed to perform inference on data described by graphs and provide straightforward ways to perform node-level, edge-level, and graph-level prediction tasks (<xref ref-type="bibr" rid="B42">Wu et al., 2019</xref>; <xref ref-type="bibr" rid="B41">Wieder et al., 2020</xref>; <xref ref-type="bibr" rid="B46">Zhou et al., 2020</xref>). GNNs are capable of learning representations and features for a specific task in an automated way, eliminating the need for excessive feature engineering (<xref ref-type="bibr" rid="B11">Gilmer et al., 2017</xref>). Another aspect of their attractiveness for molecular property prediction is the ease with which a molecule can be described as an undirected graph, transforming atoms to nodes and bonds to edges encoding both atom and bond properties. GNNs have proven to be useful and powerful tools in the machine learning molecular modeling toolbox (<xref ref-type="bibr" rid="B11">Gilmer et al., 2017</xref>; <xref ref-type="bibr" rid="B8">Deng et al., 2021</xref>).</p>
<p>Pan et al. (<xref ref-type="bibr" rid="B30">Pan et al., 2021</xref>) have shown that GNNs can be successfully applied to pK<sub>
<italic>a</italic>
</sub> predictions of chemical groups of a molecule, outperforming more traditional machine learning models relying on human-engineered descriptors and fingerprints, developing MolGpka, a web server for predicting pK<sub>
<italic>a</italic>
</sub> values. MolGpka was trained on molecules extracted from the ChEMBL database (<xref ref-type="bibr" rid="B10">Gaulton et al., 2012</xref>) containing predicted pK<sub>
<italic>a</italic>
</sub> values (predicted with ACD/Labs Physchem software<xref ref-type="fn" rid="fn11">
<sup>2</sup>
</xref>). Only the most acidic and most basic pK<sub>
<italic>a</italic>
</sub> values were considered for the training of the GNN models.</p>
<p>The goal of this work was to extend the scope of predicting pK<sub>
<italic>a</italic>
</sub> values for independently ionizable atoms (realized in MolGpka) and develop a workflow that is able to enumerate protonation states and predict the corresponding pK<sub>
<italic>a</italic>
</sub> values connecting them (sometimes referred to as &#x201c;sequential pK<sub>
<italic>a</italic>
</sub> prediction&#x201d;). To achieve this we implemented and trained a GNN model that is able to predict values for both acidic and basic groups by considering the protonated and deprotonated species involved in the corresponding acid-base reaction. We trained the model in two stages. First, we started by pre-training the model on calculated microstate pK<sub>
<italic>a</italic>
</sub> values for a large set of molecules obtained from the ChEMBL database (<xref ref-type="bibr" rid="B10">Gaulton et al., 2012</xref>). The pre-trained model already performs well on the two independent test sets used to measure the performance of the trained models. To improve its performance we fine-tuned the model on a small training set of molecules for which experimental pK<sub>
<italic>a</italic>
</sub> values were available. The fine-tuned model shows excellent and improved performance on the two test sets.</p>
<p>We have implemented the training routine and prediction pipeline in an open-source Python package named pkasolver, which is freely available and can be obtained as described in the Code and data availability section. Due to the terms of its licence agreement we are unable to distribute models trained using results generated with Epik. Users with an Epik licence can follow the instructions outlined in the data repository to obtain the fine-tuned models. For users without such a licence we provide models trained without Epik. We also provide a ready-to-use Google Colab Jupyter notebook which includes trained models and can be used to predict pK<sub>
<italic>a</italic>
</sub> values for molecules without locally installing the package (for further information see the Code and data availability section) (<xref ref-type="bibr" rid="B4">Bisong, 2019</xref>).</p>
</sec>
<sec sec-type="results|discussion" id="s2">
<title>2 Results and Discussion</title>
<p>We will start by discussing the performance of the model on the validation set of the ChEMBL data set (which contains pK<sub>
<italic>a</italic>
</sub> values calculated with Epik on a subset of the ChEMBL database) and the two independent test sets: the Novartis test set (280 molecules) and the Literature test set (123 molecules). This will be followed by a discussion of the fine-tuned model on its validation set (experimental data set), on both test sets, and on the ChEMBL data set. Subsequently, we will discuss the performance of the models trained only on the monoprotic experimental data set (without transfer learning). Finally, we will discuss the developed pkasolver package, its use cases, and limitations.</p>
<p>Performance of the different predictive models is subsequently reported using the mean absolute error (MAE) and root mean squared error (RMSE). For each metric (MAE and RMSE) the median value from 50 repetitions with different training/validation set splits is reported and the 90% confidence interval is shown. To visualize training results a single training run (out of the 50) was randomly selected and the results on the validation set plotted.</p>
<p>In the following sections we will use the term pkasolver to describe the sequential pK<sub>
<italic>a</italic>
</sub> prediction pipeline using trained GNN models. To distinguish between the transfer learning approach (models trained both on the ChEMBL and experimental data set) and the models trained <italic>only</italic> on the experimental data set we will indicate the former with pkasolver-epic and the latter with the keyword pkasolver-light.
<sec id="s2-1">
<title>2.1 Pre-Training Model Performance</title>
<p>The initial training of the GNN model was performed using the ChEMBL data set (microstate pK<sub>
<italic>a</italic>
</sub> values calculated with Epik). <xref ref-type="sec" rid="s10">Supplementary Figure S3A</xref> shows the results of the best performing model on the hold-out validation set. The MAE and RMSE are 0.29 [90% CI: 0.28; 0.31] and 0.45 [90% CI:0.44;0.49] pK<sub>
<italic>a</italic>
</sub> units shows a good fit across the reference pK<sub>
<italic>a</italic>
</sub> values. The kernel density estimates (KDE) of the distribution of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values shown in <xref ref-type="sec" rid="s10">Supplementary Figure S3A</xref> highlights the ability of the GNN to correctly learn to predict pK<sub>
<italic>a</italic>
</sub> values throughout the investigated pH range.</p>
<p>The performance of the trained GNN model was assessed on two independent test sets: the Novartis and the Literature test set (both test sets are described in detail in the Methods section) (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>). The trained model performs well on both test sets with a MAE of 0.62 [90% CI:0.57;0.67] and a RMSE of 0.97 [90% CI:0.89;1.10] pK<sub>
<italic>a</italic>
</sub> units on the Literature test set and a MAE of 0.82 [90% CI:0.77;0.85] and a RMSE of 1.13 [90% CI:1.05;1.21] pK<sub>
<italic>a</italic>
</sub> units on the Novartis test set (shown in <xref ref-type="sec" rid="s10">Supplementary Figure S2</xref>). The performance is comparable to the performance of Epik and Marvin on both test sets (shown in <xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Performance of state-of-the-art knowledge-based approaches and commercial software solutions to predict pK<sub>
<italic>a</italic>
</sub> values on the Novartis and Literature test sets are shown. For each data set, the mean absolute error (MAE) and root mean squared error (RMSE) is calculated. For MolGpKa, Epik, pkasolver-epic, and pkasolver-light the median value and the 90% confidence interval are reported.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="2" align="center">Novartis data set</th>
<th colspan="2" align="center">Literature data set</th>
</tr>
<tr>
<th align="center">MAE</th>
<th align="center">RMSE</th>
<th align="center">MAE</th>
<th align="center">RMSE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Random Forest<xref ref-type="table-fn" rid="Tfn1">
<sup>1</sup>
</xref>
<sup>,</sup>
<xref ref-type="table-fn" rid="Tfn3">
<sup>3</sup>
</xref>
</td>
<td align="center">1.15</td>
<td align="center">1.51</td>
<td align="center">0.53</td>
<td align="center">0.76</td>
</tr>
<tr>
<td align="left">ChemAxon Marvin (V20.1.0)<xref ref-type="table-fn" rid="Tfn3">
<sup>3</sup>
</xref>
</td>
<td align="center">0.86</td>
<td align="center">1.17</td>
<td align="center">0.57</td>
<td align="center">0.87</td>
</tr>
<tr>
<td align="left">MolGpKa <xref ref-type="bibr" rid="B30">Pan et al. (2021</xref>)</td>
<td align="center">0.87 [0.77;0.97]</td>
<td align="center">1.27 [1.08;1.45]</td>
<td align="center">0.49 [0.40;0.65]</td>
<td align="center">1.00 [0.56;1.53]<xref ref-type="table-fn" rid="Tfn4">
<sup>4</sup>
</xref>
</td>
</tr>
<tr>
<td align="left">Epik<xref ref-type="table-fn" rid="Tfn2">
<sup>2</sup>
</xref> <xref ref-type="bibr" rid="B30">Pan et al. (2021</xref>)</td>
<td align="center">0.83 [0.75;0.91]</td>
<td align="center">1.16 [1.06;1.26]</td>
<td align="center">0.58 [0.48;0.67]</td>
<td align="center">0.92 [0.74;1.12]</td>
</tr>
<tr>
<td align="left">pkasolver-epic</td>
<td align="center">0.71 [0.64;0.74]</td>
<td align="center">0.93 [0.85;0.97]</td>
<td align="center">0.52 [0.49;0.56]</td>
<td align="center">0.82 [0.76;0.86]</td>
</tr>
<tr>
<td align="left">pkasolver-light</td>
<td align="center">0.86 [0.81;0.94]</td>
<td align="center">1.13 [1.04;1.20]</td>
<td align="center">0.56 [0.51;0.64]</td>
<td align="center">0.82 [0.71;0.93]</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="Tfn1">
<label>1</label>
<p>Used a random forest implementation with 1,000 estimators and the FCFP6 fingerprint. Values for the best performing random forest implementation are shown.</p>
</fn>
<fn id="Tfn2">
<label>2</label>
<p>Epik identified different protonation centers than were reported in the data sets for the Novartis data set for 26 out of 280 molecules. These molecules were excluded from the MAE and RMSE calculation for Epik.</p>
</fn>
<fn id="Tfn3">
<label>3</label>
<p>values were obtained from Baltruschat and Czodrowski (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>).</p>
</fn>
<fn id="Tfn4">
<label>4</label>
<p>the reason for the large confidence interval is the incorrect prediction for a single molecule (Isomeric Smiles: CCNC) by MolGpKa with an error of 8.86 pK<sub>
<italic>a</italic>
</sub> units</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s2-2">
<title>2.2 Fine-Tuned Model Performance</title>
<p>While the performance on the test sets of the pre-trained model was already acceptable we were able to further increase model accuracy by fine-tuning the pre-trained model using a data set of experimentally measured pK<sub>
<italic>a</italic>
</sub> values. The performance of the fine-tuned model on the validation set of the experimental data set is shown in <xref ref-type="sec" rid="s10">Supplementary Figure S3B</xref>. The median performance of the fine-tuned model was improved from a RMSE of 0.97 [90% CI:0.89;1.10] to 0.82 [90% CI:0.76;0.88] pK<sub>
<italic>a</italic>
</sub> units on the Literature test set and from a RMSE of 1.13 [90% CI:1.05;1.21] to 0.93 [90% CI:0.85;0.97] pK<sub>
<italic>a</italic>
</sub> units on the Novartis test set (shown in <xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The fine-tuned GNN model is able to predict the pK<sub>
<italic>a</italic>
</sub> values of the Novartis and Literature test set with high accuracy. Panel <bold>(A)</bold> shows the performance of the fine-tuned model (initially trained with the ChEMBL data set and subsequently fine-tuned on the experimental data set) on the Literature test set. Panel <bold>(B)</bold> shows the performance of the same model on the Novartis test set. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) is shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
<graphic xlink:href="fchem-10-866585-g001.tif"/>
</fig>
<p>In order to avoid model performance degradation on the ChEMBL data set we randomly added molecules from the ChEMBL data set during the fine-tuning workflow. Adding molecules from the ChEMBL data set to restrict model parameters and avoid overfitting decreased the performance of the fine-tuned model on the ChEMBL data set only slightly (shown in <xref ref-type="sec" rid="s10">Supplementary Figure S4</xref>). This was necessary since previous attempts without regularization showed decreased accuracy of the fine-tuned model in regions outside the limited pH range of the experimental data set while improving the performance on the test sets (details to the pH range of both the ChEMBL and experimental data set are shown in <xref ref-type="sec" rid="s10">Supplementary Figure S6</xref>). An example of the performance of the fine-tuned model on the ChEMBL data set without regularization is shown in <xref ref-type="sec" rid="s10">Supplementary Figure S7</xref>.</p>
<p>To set the performance of the fine-tuned model in context we compare its performance with two recent publications investigating pK<sub>
<italic>a</italic>
</sub> predictions using machine learning. In <xref ref-type="table" rid="T1">Table 1</xref> the results are summarized for the methods presented in both Baltruschat and Czodrowski (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>) and Pan et al. (<xref ref-type="bibr" rid="B30">Pan et al., 2021</xref>). We extracted data from these publications where appropriate and recalculated values if needed. Pan et al. (<xref ref-type="bibr" rid="B30">Pan et al., 2021</xref>) split the reported results into basic and acidic groups making it necessary to recalculate the values reported there for MolGpKa and Epik, the values for Marvin were taken directly from reference (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>) (reported values were calculated without confidence interval). The fine-tuned GNN model (shown as pkasolver-epic in <xref ref-type="table" rid="T1">Table 1</xref>) performs on a par with the best performing methods reported there.</p>
<p>It is difficult to rationalize MAE/RMSE differences between different methods/models shown in <xref ref-type="table" rid="T1">Table 1</xref> since training sets and methods are different. The small difference in performance between pkasolver-epic and MolGpka could be attributed to the transfer learning routine which added experimentally measured pK<sub>
<italic>a</italic>
</sub> values. The random forest model was trained on significantly less data (only on the 5,994 pK<sub>
<italic>a</italic>
</sub> values present in the experimental data set) than either pkasolver or MolGpka yet performs comparably to both on the Literature data set while significantly worse on the Novartis data set. This might highlight the complexity of the Novartis data set, an observation previously made and investigated in Pan et al. (<xref ref-type="bibr" rid="B30">Pan et al., 2021</xref>).</p>
<p>Both Epik and Marvin perform well on both test data sets. It is surprising that pkasolver-epic can slightly outperform Epik, even though its initial training was based on data calculated by Epik. We think this emphasizes the potential of transfer learning as used in this work and data-driven deep learning in general.</p>
</sec>
<sec id="s2-3">
<title>2.3 Training on the Experimental Data Set Without Transfer Learning</title>
<p>To provide a ready-to-use pK<sub>
<italic>a</italic>
</sub> prediction pipeline for which we can distribute the trained models under the MIT licence we trained models exclusively on the experimental data set. The performance on the Novartis and Literature data set of these models is shown in <xref ref-type="sec" rid="s10">Supplementary Figure S5</xref> and summarized in <xref ref-type="table" rid="T1">Table 1</xref> (shown as pkasolver-light). While the results are comparable to Epik and MolGpKa on the test sets it is important to stress that both test sets contain only monoprotic molecules (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>).</p>
</sec>
<sec id="s2-4">
<title>2.4 Sequential pK<sub>
<italic>a</italic>
</sub> Predictions With Pkasolver</title>
<p>Combining the trained GNN models with Dimorphite-DL, a tool that identifies potential protonation sites and enumerates protonation states, enabled us to perform sequential pK<sub>
<italic>a</italic>
</sub> predictions. A detailed description of this approach is given in the Detailed methods section. We investigated multiple mono- and polyprotic molecules for qualitative and quantitative agreement between prediction and experimental data. The results for the investigated systems were of excellent consistency using pkasolver-epic and of reasonable accuracy using pkasolver-light. The list of molecules that we tested is included in the pkasolver repository and a subset of molecules of general interest for drug discovery is discussed in detail in the <xref ref-type="sec" rid="s10">Supplementary Materials</xref> section.</p>
</sec>
<sec id="s2-5">
<title>2.5 Limitations of Pkasolver</title>
<p>The sequential pK<sub>
<italic>a</italic>
</sub> prediction of pkasolver generates microstates and the calculated pK<sub>
<italic>a</italic>
</sub> values are microstate pK<sub>
<italic>a</italic>
</sub> values. One limitation of pkasolver is that only a single microstate per macrostate is generated. Tautomeric and mesomeric states are <italic>never</italic> changed during the sequential de-/protonation (i.e., double bond positions are fixed). For each protonation state the bond pattern of the molecule that was proposed by Dimorphite-DL at pH 7.4 is used. This shortcoming has several consequences. First, it leads to unusual protonation states. One example that has been observed throughout the sequential pK<sub>
<italic>a</italic>
</sub> prediction tests with pkasolver-epic are amide groups with a negative charge on the nitrogen atom. The more likely position of the charge is the more electronegative oxygen atom. This has little practical consequence since this pattern was also present in the pK<sub>
<italic>a</italic>
</sub> prediction training set generated with Epik (the mesomeric state was fixed in training too). A far more severe limitation is the fact that it is not possible to model microstates within a single macrostate, since tautomers cannot be changed (<xref ref-type="bibr" rid="B13">Gunner et al., 2020</xref>). To overcome this limitation it is necessary to enumerate tautomers for each protonation state and estimate their relative population. Solving this particular problem will be part of future work.</p>
<sec id="s2-5-1">
<title>2.5.1 Limitations of Pkasolver-Light</title>
<p>The training set of pkasolver-light contains only monoprotic pK<sub>
<italic>a</italic>
</sub> data with the majority of pK<sub>
<italic>a</italic>
</sub> values between 4 and 10 (as shown in <xref ref-type="sec" rid="s10">Supplementary Figure S6</xref>) (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>). The trained models are not necessarily suitable for polyprotic molecules. This limitation becomes apparent in the in-depth discussion of some mono- and polyprotic molecules discussed in the <xref ref-type="sec" rid="s10">Supplementary Materials</xref> section. For polyprotic molecules it is highly recommended to use pkasolver-epic instead of pkasolver-light.
</sec>
<sec id="s2-5-2">
<title>2.5.2 Limitations of Pkasolver-Epic</title>
<p>The pre-training data set imposes limitations on the applicability domain of the pK<sub>
<italic>a</italic>
</sub> predictions with pkasolver-epic. The selection criteria of the pre-training data set are described in the Methods section. In <xref ref-type="sec" rid="s10">Supplementary Figure S8</xref> the distribution of several molecular properties (molecular weight, number of heteroatoms, number of hydrogen bond acceptor/donor, frequency of elements) are shown. The transferability of the trained models for molecules outside these distributions has not been tested and the usage of pkasolver-epic for such molecules is not recommended.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<title>3 Detailed Methods</title>
<sec id="s3-1">
<title>3.1 Data Set Generation and Pre-processing</title>
<p>Four different data sets were used in this work: the ChEMBL data set, the experimental data set, the Novartis data set and the Literature data set.</p>
<p>The ChEMBL data set used for pre-training was obtained from the ChEMBL database using the number of Rule-of-Five violations (set to a maximum of one violation) as filter criteria (<xref ref-type="bibr" rid="B10">Gaulton et al., 2012</xref>; <xref ref-type="bibr" rid="B7">Davies et al., 2015</xref>). For each of the molecules, a pK<sub>
<italic>a</italic>
</sub> scan for the pH range between zero and 14 was performed using the Schr&#xf6;dinger tool Epik (<xref ref-type="bibr" rid="B39">Shelley et al., 2007</xref>; <xref ref-type="bibr" rid="B12">Greenwood et al., 2010</xref>) (Version 2021-1). The sequential pK<sub>
<italic>a</italic>
</sub> scan indicated for 320,800 molecules one or multiple protonation state/s, resulting in a total of 729,375 pK<sub>
<italic>a</italic>
</sub> values. For each pK<sub>
<italic>a</italic>
</sub> value, Epik further indicated the protonation center using the atom index of the heavy atom at which either a hydrogen is attached or removed.</p>
<p>To perform transfer learning we obtained a second data set with experimental pK<sub>
<italic>a</italic>
</sub> values. This data set (subsequently called &#x2018;experimental data set&#x2019;) was developed by Baltruschat and Czodrowski (<xref ref-type="bibr" rid="B2">Baltruschat and Czodrowski, 2020b</xref>) and can be acquired from their GitHub repository<xref ref-type="fn" rid="fn3">
<sup>3</sup>
</xref>. For a detailed description of the curating steps taken to generate this data set, we point the reader to the Methods section of (<xref ref-type="bibr" rid="B2">Baltruschat and Czodrowski, 2020b</xref>). The experimental data set consists of 5,994 unique molecules, each with a single pK<sub>
<italic>a</italic>
</sub> value and an atom index indicating the reaction center. Some of the molecules had to be corrected to obtain their protonation state at pH 7.4 (examples shown in <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>).</p>
<p>To test the performance of the models, two independent data sets were used, which were provided and curated by Baltruschat and Czodrowski (<xref ref-type="bibr" rid="B2">Baltruschat and Czodrowski, 2020b</xref>). The Literature data set contains 123 compounds collected by manually curating the literature. The Novartis data set contains 280 molecules provided by Novartis (<xref ref-type="bibr" rid="B20">Liao and Nicklaus, 2009b</xref>). For each molecule, a pK<sub>
<italic>a</italic>
</sub> value and atom index indicating the reaction center was provided. To avoid training the model on molecules present in the Literature or Novartis data set we filtered the ChEMBL data set using the InChIKey and canonical SMILES strings of the neutralized molecules as matching criteria. 50 molecules were identified and removed from the ChEMBL data set. All checks were performed using RDKit (<xref ref-type="bibr" rid="B34">RDKit and Open-Source Chemiformatics, 2022</xref>).</p>
</sec>
<sec id="s3-2">
<title>3.2 Enumerate Protonation States During Training/Testing</title>
<p>The goal of calculating microstate pK<sub>
<italic>a</italic>
</sub> values is to find the pH value at which the concentration of two molecular species is equal. To do this efficiently, we provide as input the protonated and deprotonated molecular species of the acid-base pair for which we want to calculate the pK<sub>
<italic>a</italic>
</sub> value (the Br&#xf8;nsted acid/base definitions are used here and subsequently (<xref ref-type="bibr" rid="B25">McNaught and Wilkinson, 2014</xref>)). This approach enables a consistent treatment of acids and bases with a single data structure (the acid-base pair).</p>
<p>This workflow made it necessary that we generate the molecular species at each protonation state starting from the molecule at pH 7.4 by removing or adding hydrogen to the reaction center (which was calculated by Marvin for the experimental, Novartis, and Literature data set and Epik for the ChEMBL data set). We do this by sequentially adding hydrogen atoms from highest to lowest pK<sub>
<italic>a</italic>
</sub> for acids (i.e., at pH &#x3d; 0 all possible protonation sites are protonated) and removing hydrogen atoms from lowest to highest pK<sub>
<italic>a</italic>
</sub> value for bases on the structure present at pH 7.4 (at pH &#x3d; 14 all possible protonation sites are deprotonated).</p>
<p>This approach presented challenges for the ChEMBL data set for which sequential pK<sub>
<italic>a</italic>
</sub> values and reaction centers were calculated with Epik. Epik calculates the microstate pK<sub>
<italic>a</italic>
</sub> value on the most probable tautomeric/mesomeric structure. This leads to potential protonation states that require changes in the double bond pattern and redistribution of hydrogen. Since we do not consider tautomeric changes to the molecular structure in the present implementation, such tautomeric changes can introduce invalid molecules in either the sequential removal or addition of hydrogen atoms. Whenever such molecular structures were encountered we removed these protonation states from further consideration. Additionally, we used RDKit&#x2019;s sanitize function to identify cases for which protonation state changes introduce invalid atom valences. In other cases in which the protonation state change on a mesomeric structure introduces valid yet improbable molecular structures (e.g. protonating the oxygen in an amide instead of the nitrogen) we keep these structures. This reduced the number of molecules and protonation states in the ChEMBL data set to 286,816 molecules and 714,906 protonation states. The distribution of pK<sub>
<italic>a</italic>
</sub> values for the ChEMBL and experimental data set is shown in <xref ref-type="sec" rid="s10">Supplementary Figure S6</xref>.</p>
</sec>
<sec id="s3-3">
<title>3.3 Training and Testing With PyTorch Geometric</title>
<p>We use PyTorch and PyTorch geometric (subsequently abbreviated as PyG) for model training, testing, and prediction of pK<sub>
<italic>a</italic>
</sub> values on the graph data structures (<xref ref-type="bibr" rid="B9">Fey and Lenssen, 2019</xref>; <xref ref-type="bibr" rid="B31">Paszke et al., 2019</xref>).</p>
<sec id="s3-3-1">
<title>3.3.1 Graph Data Structure</title>
<p>A graph <italic>G</italic> is defined as a set of nodes <italic>V</italic> and edges <italic>E</italic> connecting the nodes. Each node <italic>v</italic> &#x2208; <italic>V</italic> has a feature vector <italic>x</italic>
<sub>
<italic>v</italic>
</sub>, which encodes atom properties like element, charge, number of hydrogen, as well as the presence of particular SMARTS patterns as a one-hot-encoding bit vector (all atom properties are shown in <xref ref-type="sec" rid="s10">Supplementary Table S1</xref>). The adjacency matrix <italic>A</italic> defines the connectivity of the graph. <italic>A</italic> is defined as a quadratic matrix with <italic>A</italic>
<sub>
<italic>uv</italic>
</sub> &#x3d; 1 if there is an edge between node <italic>u</italic> and <italic>v</italic> and <italic>A</italic>
<sub>
<italic>uv</italic>
</sub> &#x3d; 0 if there is no edge between node <italic>u</italic> and <italic>v</italic>.</p>
<p>We used RDKit to generate a graph representation of the molecule with atoms represented as nodes and bonds as edges (in coordinate list format <xref ref-type="fn" rid="fn4">
<sup>4</sup>
</xref> to efficiently represent the sparse matrix).</p>
</sec>
<sec id="s3-3-2">
<title>3.3.2 Graph Neural Network Architecture</title>
<p>To predict a single pK<sub>
<italic>a</italic>
</sub> value the graph neural network (GNN) architecture takes as input two graphs representing the conjugated acid-base pair as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. <xref ref-type="fig" rid="F2">Figure 2B</xref> shows the high-level architecture of the used GNN.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Panel <bold>(A)</bold> shows the general workflow used to train the GNN on pK<sub>
<italic>a</italic>
</sub> values for a single molecule. During the training and testing phase, each molecule was provided in the structure dominant at pH 7.4 with atom indices indicating the protonation sites and corresponding pK<sub>
<italic>a</italic>
</sub> values connecting them. In the Enumeration of protonation states phase we generate the protonation state for each pK<sub>
<italic>a</italic>
</sub> value. The molecular species for each of the protonation states are then translated in their graph representation using nodes for atoms and edges for bonds, with node feature vectors encoding atom properties in the Graph representation phase. In the pK<sub>
<bold>
<italic>a</italic>
</bold>
</sub> prediction phase graphs of two neighboring protonation states are combined and used as input for the GNN model to predict the pK<sub>
<italic>a</italic>
</sub> value for the acid-base pair [using the Br&#xf8;nsted&#x2013;Lowry acid/base definition (<xref ref-type="bibr" rid="B25">McNaught and Wilkinson, 2014</xref>)]. The architecture of the GNN model is shown in detail in panel <bold>(B)</bold>. For a pair of neighboring protonation states two independent GIN (graph isomorphism network) convolution layers and ReLU activation functions are used for the protonated and the deprotonated molecular graph to pass information of neighboring atoms and achieve the embedding of the chemical environment of each atom (<xref ref-type="bibr" rid="B43">Xu et al., 2019</xref>). The output of the convolutional layer is summarized using a global average pooling layer, generating the condensed input for the multilayer perceptron (MLP). To add regularization and to prevent co-adaptation of neurons a dropout layer was added.</p>
</caption>
<graphic xlink:href="fchem-10-866585-g002.tif"/>
</fig>
<p>There are three phases to predict a pK<sub>
<italic>a</italic>
</sub> value from a pair of molecular graphs. The first stage involves recurrently updating the node states using GIN (graph isomorphism network) convolution layers and ReLU activation functions (<xref ref-type="bibr" rid="B43">Xu et al., 2019</xref>). We used 3 GIN layers with an embedding size of 64 bits each to propagate information throughout the graph and update each node with information about the extended environment. In the second stage, a global average pooling is performed to produce the embedding of the protonated and deprotonated graph, resulting in two 32 bit vectors. Concatenating the two 32 bit vectors produces the input for the third stage, the multilayer perceptron (MLP) with 3 fully connected layers (each with an embedding size of 64). To add regularization and to prevent co-adaptation of neurons a dropout layer randomly zeros out elements of the pooling output vector with <italic>p</italic> &#x3d; 0.5 during training. Additionally, batch normalization is applied as described in (<xref ref-type="bibr" rid="B14">Ioffe and Szegedy, 2015</xref>).</p>
</sec>
<sec id="s3-3-3">
<title>3.3.3 GNN Model Training</title>
<p>Before each training run the ChEMBL and experimental data set were shuffled and randomly split in training (90% of the data) and validation set (10% of the data). To ensure that we can reproduce these splits the seed for each split was recorded.</p>
<p>The mean squared error (MSE) of predicted and reference pK<sub>
<italic>a</italic>
</sub> values on the training data set was calculated and parameter optimization was performed using the Adam optimizer with decoupled weight decay regularization (<xref ref-type="bibr" rid="B21">Loshchilov and Hutter, 2019</xref>) as implemented in PyTorch.</p>
<p>Model performance was evaluated on the validation set and the model with the best performance was selected either for fine-tuning or further evaluation on the test data sets. The performance on the evaluation data set was calculated after every fifth epoch and the corresponding weights were saved. The learning rate for all training runs was dynamically reduced by a factor of 0.5 if the validation set performance did not change within 150 epochs (validation set performance threshold was set to 0.1).</p>
<p>Pre-training of the GNN was performed on the ChEMBL data set with a learning rate of 1<italic>x</italic>10<sup>&#x2212;3</sup> and a batch size of 512 molecules for 1,000 epochs. Fine-tuning was performed using the experimental data set with a learning rate of 1<italic>x</italic>10<sup>&#x2212;3</sup> and a batch size of 64 molecules for 1,000 epochs. All parameters of the GNN models were optimized during fine-tuning. To avoid overfitting to the experimental data set we added to each batch of the fine-tuning data set a randomly selected batch (1,024 molecules) of the pre-training data set.</p>
<p>To calculate the confidence intervals of the model performance, pre-training and fine-tuning were repeated 50 times, each with a random training-validation set split resulting in 50 independently fine-tuned models.</p>
</sec>
</sec>
<sec id="s3-4">
<title>3.4 Sequential pK<sub>
<italic>a</italic>
</sub> Value Prediction With Pkasolver</title>
<p>We use Dimorphite-DL to identify the proposed structure at pH 7.4 and all de-/protonation sites for a given molecule (<xref ref-type="bibr" rid="B36">Ropp et al., 2019</xref>).</p>
<p>We iteratively protonate each of the proposed de-/protonation sites generating a molecular pair consisting of the protonated and deprotonated molecular species (in the first iteration the deprotonated molecule is the molecule at pH 7.4). For each of the protonated/deprotonated pairs a pK<sub>
<italic>a</italic>
</sub> value is calculated. The protonated structure with the highest pK<sub>
<italic>a</italic>
</sub> value (but below pH 7.4) is kept and the protonation site is removed from the list of possible protonation sites. This is repeated until either (1) all protonation sites are protonated, (2) no more valid molecules can be generated, or (3) the calculated pK<sub>
<italic>a</italic>
</sub> values are outside the allowed pK<sub>
<italic>a</italic>
</sub> range.</p>
<p>To enumerate all deprotonated structures we start again with the structure at pH 7.4 and start to iteratively deprotonate each of the proposed de-/protonation sites. Here, we always keep the deprotonated structure with the lowest pK<sub>
<italic>a</italic>
</sub> value that is above 7.4.</p>
<p>pK<sub>
<italic>a</italic>
</sub> values are calculated using 25 of the 50 fine-tuned GNN models. For each protonation state, the average pK<sub>
<italic>a</italic>
</sub> value is calculated and the standard deviation is shown to enable the user to identify molecules or protonation states for which the GNN model estimates are uncertain.</p>
<p>We provide a ready to use implementation of pkasolver to predict sequential pK<sub>
<italic>a</italic>
</sub> values in our GitHub repository (for further information see the Code and data availability section).</p>
</sec>
</sec>
<sec id="s4">
<title>4 Conclusion</title>
<p>We have shown that GNNs can be used to predict mono- and polyprotic pK<sub>
<italic>a</italic>
</sub> values and achieve excellent performance on two external test sets. Training the GNN model in two stages with a pre-training phase using a large set of molecules with calculated pK<sub>
<italic>a</italic>
</sub> values and a fine-tuning phase on a small set of molecules with experimentally measured pK<sub>
<italic>a</italic>
</sub> values improves the performance of the GNN model significantly. This performance boost is especially noteworthy on the challenging Novartis test set (the RMSE was decreased from 1.18 [1.05;1.27] to 0.93 [0.85;0.97] pK<sub>
<italic>a</italic>
</sub> units). A direct comparison with other software solutions and machine learning models on the two test sets shows that the fine-tuned GNN model performs consistently on a par with the best results of other commercial and non-commercial tools.</p>
<p>We have implemented pkasolver as an open-source and free-to-use Python package under a permissive licence (MIT licence). We provide two versions of the package: pkasolver-epic and pkasolver-light. The former performs best on both test sets and is suitable for sequential pK<sub>
<italic>a</italic>
</sub> prediction on polyprotic molecules. It was pretrained on a subset of the ChEMBL data set for which pK<sub>
<italic>a</italic>
</sub> values were predicted using Epik and fine-tuned on experimental monoprotic pK<sub>
<italic>a</italic>
</sub> values. Due to the terms of the licence agreement of Epik we are unable to supply the trained models but provide the training pipeline to reproduce the models (which requires an active Epik license). pkasolver-light performs well on both test sets but its application domain is limited to monoprotic molecules. These are the trained models distributed with the pkasolver package.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="sec" rid="s10">Supplementary Material</xref>. Python package used in this work (release v0.3) and Colabs Jupyter notebook link: <ext-link ext-link-type="uri" xlink:href="https://github.com/mayrf/pkasolver">https://github.com/mayrf/pkasolver</ext-link>. Data and notebooks to reproduce the plots/figures (release v0.2): <ext-link ext-link-type="uri" xlink:href="https://github.com/wiederm/pkasolver-data">https://github.com/wiederm/pkasolver-data</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>Conceptualization: FM, OW, TL, and MW; Methodology: FM, OW, and MW; Software: FM and MW; Investigation: FM and MW; Writing&#x2013;Original Draft: FM, OW, and MW; Writing&#x2013;Review and Editing: FM, OW, TL, and MW; Funding Acquisition: MW, TL; Resources: TL; Supervision: MW, TL.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>MW acknowledges support from an FWF Erwin Schr&#xf6;dinger Postdoctoral Fellowship J 4245-N28. FM and TL gratefully acknowledge funding by the NeuroDeRisk project (<ext-link ext-link-type="uri" xlink:href="https://www.neuroderisk.eu">https://www.neuroderisk.eu</ext-link>), which has received funding from the Innovative Medicines Initiative 2 Joint Undertaking (IMI2 JU, <ext-link ext-link-type="uri" xlink:href="https://european-union.europa.eu/institutions-law-budget/institutions-and-bodies/institutions-and-bodies-profiles/imi-2-ju_en">https://european-union.europa.eu/institutions-law-budget/institutions-and-bodies/institutions-and-bodies-profiles/imi-2-ju_en</ext-link>) under Grant Agreement No. 821528. This Joint Undertaking receives support from the European Union&#x2019;s Horizon 2020 research and innovation program and the European Federation of Pharmaceutical Industries and Associations (EFPIA, <ext-link ext-link-type="uri" xlink:href="https://www.efpia.eu">https://www.efpia.eu</ext-link>).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ack>
<p>MW is grateful for discussions with David Bushiri, John Chodera, Josh Fass, Nils Krieger, Magdalena Wiercioch, Steffen Hirte, Thomas Seidel, and the Tautomer Consortium, specifically Paul Czodrowski, Brian Radak, Woody Sherman, David Mobley, Christopher Bayly, and Stefan Kast. MW, OW, FM, and TL are grateful for the help of Gerhard F. Ecker and his group members who performed the reference pK<sub>
<italic>a</italic>
</sub> calculations with Epik for the ChEMBL data set.</p>
</ack>
<sec id="s10">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fchem.2022.866585/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fchem.2022.866585/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>Supplementary Figure S1</label>
<caption>
<p>Protonation state errors in the experimental data set. This exemplary selection shows molecules from the experimental data set provided by Baltruschat and Czodrowski (<xref ref-type="bibr" rid="B1">Baltruschat and Czodrowski, 2020a</xref>) for which the protonation state provided does not correspond to the state at pH 7.4. For examples 1, 2, 4 and 5 with experimental <italic>pK</italic>
<sub>
<italic>a</italic>
</sub> values below 7.4 protonation at the reaction center would result in highly unlikely pentavalent nitrogen. For examples 3 and 6 with <italic>pK</italic>
<sub>
<italic>a</italic>
</sub> values above 7.4 deprotonation at the reaction site cannot be performed because of the lack of a suitable hydrogen. These errors were corrected during our data preparation.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S2</label>
<caption>
<p>Performance of the pre-trained GNN model on the Novartis and Literature test set is shown. 50 training runs with different training/validation splits were performed and for each training run the best model was selected based on its performance on the validation set (shown here is a single, randomly selected training run). Panel <bold>(A)</bold> shows the performance of the GNN model on the Literature data set. Panel <bold>(B)</bold> shows the performance of the GNN model on the Novartis data set. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) are shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S3</label>
<caption>
<p>Performance of the pre-trained and fine-tuned models are shown on the respective validation sets. 50 training runs with different training/validation splits were performed and for each training run the best model was selected based on its performance on the validation set (shown here is a single, randomly selected training run). Panel <bold>(A)</bold> shows the validation set performance of the best GNN model trained on the ChEMBL data set. Panel <bold>(B)</bold> shows the validation set performance starting from the same pre-trained model after fine-tuning on the experimental training set. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) are shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S4</label>
<caption>
<p>The accuracy of the fine-tuned GNN model only decreases slightly when molecules from the ChEMBL data set are used for regularization. 50 fine-tuning runs with different training/validation splits were performed, each initialized using the parameters of 50 pre-training runs, and for each training run the best model was selected based on its performance on the validation set. In order to generate a single plot we selected randomly a single fine-tuning run and generated the scatter plot with the best performing model on the validation set. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) are shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S5</label>
<caption>
<p>The performance of the GNN model trained exclusively on the experimental data set is shown. 50 training runs with different training/validation splits were performed. To generate a single plot a randomly selected training run is shown. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) are shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S6</label>
<caption>
<p>The pK<sub>
<italic>a</italic>
</sub> distribution of ChEMBL and experimental data set are shown.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S7</label>
<caption>
<p>The performance of the fine-tuned GNN model on the ChEMBL data set is shown. In contrast to the results obtained with the fine-tuned models shown in <xref ref-type="sec" rid="s10">Supplementary Figure S4</xref> the models shown here did not use regularization. The performance of the GNN model decreased significantly on the ChEMBL data, shifting pK<sub>
<italic>a</italic>
</sub> values above 12 and below 2. The solid red line in the scatter plot indicates the ideal behavior of the reference and predicted pK<sub>
<italic>a</italic>
</sub> values, the dashed lines mark the &#xb1;1 pK<sub>
<italic>a</italic>
</sub> unit interval. Mean absolute error (MAE) and root mean squared error (RMSE) are shown, the values in bracket indicate the 90% confidence interval calculated from 50 repetitions with random training/validation splits. <italic>N</italic> indicates the number of investigated samples.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S8</label>
<caption>
<p>The distribution of molecular weight, the number of heteroatoms, hydrogen bond acceptors (HBAs) and hydrogen bond donors (HBDs) and distribution of elements per molecule are shown for the ChEMBL data set.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S9</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for ethylenediaminetetraacetic acid (EDTA). For each protonation state the base-acid pair is shown and the consensus prediction for the pK<sub>
<italic>a</italic>
</sub> value with the standard deviation is shown. The protonation site is highlighted for each protonation state.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S10</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for lisdexamfetamine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S11</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for cocaine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S12</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for tyrosine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S13</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for taurine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S14</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for aspergillic acid.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S15</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for ketamine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S16</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for levodopa.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S17</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for furosemide.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S18</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-light for furosemide.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S19</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for an aryl guanidine (SMILES: C1CNC(N1)&#x3d;NC1&#x3d;CC&#x3d;CN&#x3d;C1 ).</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Figure S20</label>
<caption>
<p>Results are shown for a sequential pK<sub>
<italic>a</italic>
</sub> prediction using pkasolver-epic for pyridine.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table S1</label>
<caption>
<p>List of one-hot-encoding of atom features used for the node feature vector deposited in the node feature matrix <italic>X</italic>.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>Supplementary Table S2</label>
<caption>
<p>Experimental and calculated pK<sub>
<italic>a</italic>
</sub> values for the 24 compounds of the SAMPL6 pK<sub>
<italic>a</italic>
</sub> challenge (<xref ref-type="bibr" rid="B15">I&#x15f;&#x131;k et al., 2018</xref>). pK<sub>
<italic>a</italic>
</sub> values were calculated using pkasolver-epic. pK<sub>
<italic>a</italic>
</sub> values and standard deviation (shown in parentheses) are rounded to one significant digit. The pK<sub>
<italic>a</italic>
</sub> value used to match the experimental pK<sub>
<italic>a</italic>
</sub> value is shown in red.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn id="fn10">
<label>1</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/samplchallenges/SAMPL8">https://github.com/samplchallenges/SAMPL8</ext-link>
</p>
</fn>
<fn id="fn11">
<label>2</label>
<p>version 12.01, Advanced Chemistry Development Inc. 2010ACD/Labs</p>
</fn>
<fn id="fn3">
<label>3</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/czodrowskilab/Machine-learning-meets-pKa">https://github.com/czodrowskilab/Machine-learning-meets-pKa</ext-link>
</p>
</fn>
<fn id="fn4">
<label>4</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html">https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html</ext-link>
</p>
</fn>
<fn id="fn5">
<label>5</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://www.vanderbilt.edu/AnS/Chemistry/Rizzo/stuff/AA/AminoAcids.html">https://www.vanderbilt.edu/AnS/Chemistry/Rizzo/stuff/AA/AminoAcids.html</ext-link>
</p>
</fn>
<fn id="fn6">
<label>6</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://www.sigmaaldrich.com/deepweb/assets/sigmaaldrich/product/documents/315/570/d9628pis.pdf">https://www.sigmaaldrich.com/deepweb/assets/sigmaaldrich/product/documents/315/570/d9628pis.pdf</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baltruschat</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Czodrowski</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Machine Learning Meets pKa</source>, <fpage>9</fpage>. <comment>[version 2; peer review: 2 approved]</comment>. </citation>
</ref>
<ref id="B2">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baltruschat</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Czodrowski</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Machine Learning Meets pKa</source>, <fpage>9</fpage>. <comment>[version 2; peer review: 2 approved]</comment>. </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bergazin</surname>
<given-names>T. D.</given-names>
</name>
<name>
<surname>Tielker</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gunner</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Francisco</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Evaluation of Log P, pKa, and Log D Predictions from the SAMPL7 Blind Challenge</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>35</volume>, <fpage>771</fpage>&#x2013;<lpage>802</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-021-00397-3</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bisong</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>In: Building Machine Learning and Deep Learning Models on Google Cloud Platform Berkeley, CA: Apress</article-title>. <source>Google Colab.</source>, <fpage>59</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1007/978-1-4842-4470-8_7</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<collab>CRC Handbook</collab> (<year>2007</year>). <source>CRC Handbook of Chemistry and Physics</source>. <edition>88th Edition</edition>. <publisher-name>CRC Press</publisher-name>, <fpage>88</fpage>. <ext-link ext-link-type="uri" xlink:href="http://www.amazon.com/CRC-Handbook-Chemistry-Physics-88th/dp/0849304881/ref=sr_1_5?ie=UTF8&amp;qid=1302802093&amp;sr=8-5">http://www.amazon.com/CRC-Handbook-Chemistry-Physics-88th/dp/0849304881/ref&#x3d;sr_1_5?ie&#x3d;UTF8&#x26;qid&#x3d;1302802093&#x26;sr&#x3d;8-5</ext-link>. </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dardonville</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Caine</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Navarro De La Fuente</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mart&#xed;n Herranz</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Corrales Mariblanca</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Popelier</surname>
<given-names>P. L. A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Substituent Effects on the Basicity (pKa) of Aryl Guanidines and 2-(arylimino)imidazolidines: Correlations of pH-Metric and UV-Metric Values with Predictions from Gas-phase Ab Initio Bond Lengths</article-title>. <source>New J. Chem.</source> <volume>41</volume> (<issue>19</issue>), <fpage>11016</fpage>&#x2013;<lpage>11028</lpage>. <pub-id pub-id-type="doi">10.1039/c7nj02497e</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Davies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nowotka</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Papadatos</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Dedman</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Gaulton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Atkinson</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>ChEMBL Web Services: Streamlining Access to Drug Discovery Data and Utilities</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume> (<issue>W1</issue>), <fpage>W612</fpage>&#x2013;<lpage>W620</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv352</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deng</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>XGraphBoost: Extracting Graph Neural Network-Based Features for a Better Prediction of Molecular Properties</article-title>. <source>J. Chem. Inf. Model.</source> <volume>61</volume> (<issue>6</issue>), <fpage>2697</fpage>&#x2013;<lpage>2705</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.0c01489</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Fey</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lenssen</surname>
<given-names>J. E.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Fast Graph Representation Learning with PyTorch Geometric</source>. <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1903.02428,%20cite%20arxiv:1903.02428">http://arxiv.org/abs/1903.02428, cite arxiv:1903.02428</ext-link>. </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gaulton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bellis</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Bento</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Chambers</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Davies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hersey</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>ChEMBL: a Large-Scale Bioactivity Database for Drug Discovery</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume> (<issue>D1</issue>), <fpage>D1100</fpage>&#x2013;<lpage>D1107</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkr777</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gilmer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schoenholz</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Riley</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Dahl</surname>
<given-names>G. E.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Neural Message Passing for Quantum Chemistry</source>. <pub-id pub-id-type="doi">10.1002/nme.2457</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Greenwood</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Calkins</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sullivan</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Shelley</surname>
<given-names>J. C.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Towards the Comprehensive, Rapid, and Accurate Prediction of the Favorable Tautomeric States of Drug-like Molecules in Aqueous Solution</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>24</volume> (<issue>6-7</issue>), <fpage>591</fpage>&#x2013;<lpage>604</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-010-9349-1</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gunner</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Murakami</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Rustenburg</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>I&#x15f;&#x131;k</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chodera</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Standard State Free Energies, Not pKas, Are Ideal for Describing Small Molecule Protonation and Tautomeric States</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>34</volume> (<issue>5</issue>), <fpage>561</fpage>&#x2013;<lpage>573</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-020-00280-7</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</source>. <publisher-name>CoRR</publisher-name>. <comment>abs/1502.03167. http://arxiv.org/abs/1502.03167</comment>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>I&#x15f;&#x131;k</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Levorse</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rustenburg</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Ndukwe</surname>
<given-names>I. E.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>pKa Measurements for the SAMPL6 Prediction Challenge for a Set of Kinase Inhibitor-like Fragments</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>32</volume> (<issue>10</issue>), <fpage>1117</fpage>&#x2013;<lpage>1138</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-018-0168-0</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>I&#x15f;&#x131;k</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rustenburg</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Rizzi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gunner</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Mobley</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Chodera</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Overview of the SAMPL6 pKa Challenge: Evaluating Small Molecule Microscopic and Macroscopic pKa Predictions</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>35</volume>, <fpage>131</fpage>&#x2013;<lpage>166</lpage>. <comment>Springer International Publishing</comment>. <pub-id pub-id-type="doi">10.1007/s10822-020-00362-6</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hsieh</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Could Graph Neural Networks Learn Better Molecular Representation for Drug Discovery? A Comparison Study of Descriptor-Based and Graph-Based Models</article-title>. <source>J. Cheminform</source> <volume>13</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>23</lpage>. <pub-id pub-id-type="doi">10.1186/s13321-020-00479-8</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Latscha</surname>
<given-names>H. P.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Linti</surname>
<given-names>G. W.</given-names>
</name>
</person-group> (<year>2004</year>). <source>Analytische Chemie: Chemie-Basiswissen III. Chemie-Basiswissen</source>. <publisher-name>Springer</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://books.google.pt/books?id=xVJ0WtmKMHQC">https://books.google.pt/books?id&#x3d;xVJ0WtmKMHQC</ext-link>. </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Nicklaus</surname>
<given-names>M. C.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Comparison of Nine Programs Predicting pKa Values of Pharmaceutical Substances</article-title>. <source>J. Chem. Inf. Model.</source> <volume>49</volume> (<issue>12</issue>), <fpage>2801</fpage>&#x2013;<lpage>2812</lpage>. <pub-id pub-id-type="doi">10.1021/ci900289x</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Nicklaus</surname>
<given-names>M. C.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Comparison of Nine Programs Predicting pKa Values of Pharmaceutical Substances</article-title>. <source>J. Chem. Inf. Model.</source> <volume>49</volume> (<issue>12</issue>), <fpage>2801</fpage>&#x2013;<lpage>2812</lpage>. <comment>Dec</comment>. <pub-id pub-id-type="doi">10.1021/ci900289x</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Loshchilov</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Decoupled Weight Decay Regularization. 7th International Conference on Learning Representations</source>. <publisher-name>ICLR 2019</publisher-name>. </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manallack</surname>
<given-names>D. T.</given-names>
</name>
<name>
<surname>Prankerd</surname>
<given-names>R. J.</given-names>
</name>
<name>
<surname>Yuriev</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Oprea</surname>
<given-names>T. I.</given-names>
</name>
<name>
<surname>Chalmers</surname>
<given-names>D. K.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>The Significance of Acid/base Properties in Drug Discovery</article-title>. <source>Chem. Soc. Rev.</source> <volume>42</volume> (<issue>2</issue>), <fpage>485</fpage>&#x2013;<lpage>496</lpage>. <pub-id pub-id-type="doi">10.1039/C2CS35348B</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manallack</surname>
<given-names>D. T.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The pKa Distribution of Drugs: Application to Drug Discovery</article-title>. <source>Perspect. Med. Chem.</source>, <volume>1</volume>, <fpage>1177391X0700100</fpage>. <pub-id pub-id-type="doi">10.1177/1177391X0700100003</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mansouri</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Cariello</surname>
<given-names>N. F.</given-names>
</name>
<name>
<surname>Korotcov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tkachenko</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Grulke</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Sprankle</surname>
<given-names>C. S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Open-source QSAR Models for pKa Prediction Using Multiple Machine Learning Approaches</article-title>. <source>J. Cheminform</source> <volume>11</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1186/s13321-019-0384-1</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>McNaught</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Wilkinson</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <source>IUPAC Compendium of Chemical Terminology</source>. <publisher-name>International Union of Pure and Applied Chemistry</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://books.google.at/books?id=l2LojwEACAAJ">https://books.google.at/books?id&#x3d;l2LojwEACAAJ</ext-link>. </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mech</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bogunia</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nowacki</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Makowski</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Calculations of pKa Values of Selected Pyridinium and its N-Oxide Ions in Water and Acetonitrile</article-title>. <source>J. Phys. Chem. A</source> <volume>124</volume> (<issue>3</issue>), <fpage>538</fpage>&#x2013;<lpage>551</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jpca.9b10319</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<collab>National Center for Biotechnology Information</collab> (<year>2022a</year>). <source>PubChem Compound Summary for CID 3440</source>. <publisher-name>Lisdexamfetamine</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://pubchem.ncbi.nlm.nih.gov/compound/Lisdexamfetamine">https://pubchem.ncbi.nlm.nih.gov/compound/Lisdexamfetamine</ext-link>. </citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<collab>National Center for Biotechnology Information</collab> (<year>2022b</year>). <source>PubChem Compound Summary for CID 3440</source>. <publisher-name>Cocaine</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://pubchem.ncbi.nlm.nih.gov/compound/Cocaine">https://pubchem.ncbi.nlm.nih.gov/compound/Cocaine</ext-link>. </citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<collab>National Center for Biotechnology Information</collab> (<year>2022c</year>). <source>PubChem Compound Summary for CID 3440</source>. <publisher-name>Furosemide</publisher-name>. <ext-link ext-link-type="uri" xlink:href="https://pubchem.ncbi.nlm.nih.gov/compound/Furosemide">https://pubchem.ncbi.nlm.nih.gov/compound/Furosemide</ext-link>. </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J. Z. H.</given-names>
</name>
<name>
<surname>Ji</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>MolGpka: A Web Server for Small Molecule pKa Prediction Using a Graph-Convolutional Neural Network</article-title>. <source>J. Chem. Inf. Model.</source> <volume>61</volume> (<issue>7</issue>), <fpage>3159</fpage>&#x2013;<lpage>3165</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.1c00075</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PyTorch: An Imperative Style, High-Performance Deep Learning Library</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems 32</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x27;Alch&#xe9;-Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>8024</fpage>&#x2013;<lpage>8035</lpage>. <ext-link ext-link-type="uri" xlink:href="http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf">http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf</ext-link>. </citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Perrin</surname>
<given-names>D. D.</given-names>
</name>
<name>
<surname>Dempsey</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Serjeant</surname>
<given-names>E. P.</given-names>
</name>
</person-group> (<year>1981</year>). <source>pKa Prediction for Organic Acids and Bases</source>. <publisher-loc>Dordrecht</publisher-loc>: <publisher-name>Springer Netherlands</publisher-name>. <pub-id pub-id-type="doi">10.1007/978-94-009-5883-8</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Prasad</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Brooks</surname>
<given-names>B. R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>An Explicit-Solvent Hybrid QM and MM Approach for Predicting pKa of Small Molecules in SAMPL6 Challenge</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>32</volume> (<issue>10</issue>), <fpage>1191</fpage>&#x2013;<lpage>1201</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-018-0167-1</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<collab>RDKit, Open-Source Cheminformatics</collab> (<year>2022</year>). <source>RDKit, Open-Source Cheminformatics</source>. <ext-link ext-link-type="uri" xlink:href="http://www.rdkit.org">http://www.rdkit.org</ext-link>. </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rogers</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hahn</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Extended-connectivity Fingerprints</article-title>. <source>J. Chem. Inf. Model.</source> <volume>50</volume> (<issue>5</issue>), <fpage>742</fpage>&#x2013;<lpage>754</lpage>. <comment>may</comment>. <pub-id pub-id-type="doi">10.1021/ci100050t</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ropp</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Kaminsky</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Yablonski</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Durrant</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Dimorphite-DL: An Open-Source Program for Enumerating the Ionization States of Drug-like Small Molecules</article-title>. <source>J. Cheminform</source> <volume>11</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1186/s13321-019-0336-9</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rupp</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Korner</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tetko</surname>
<given-names>I. V.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Predicting the pKa of Small Molecules</article-title>. <source>Cchts</source> <volume>14</volume> (<issue>5</issue>), <fpage>307</fpage>&#x2013;<lpage>327</lpage>. <pub-id pub-id-type="doi">10.2174/138620711795508403</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Selwa</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kenney</surname>
<given-names>I. M.</given-names>
</name>
<name>
<surname>Beckstein</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Iorga</surname>
<given-names>B. I.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>SAMPL6: Calculation of Macroscopic pKa Values from Ab Initio Quantum Mechanical Free Energies</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>32</volume> (<issue>10</issue>), <fpage>1203</fpage>&#x2013;<lpage>1216</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-018-0138-6</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shelley</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Cholleti</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Frye</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Greenwood</surname>
<given-names>J. R.</given-names>
</name>
<name>
<surname>Timlin</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Uchimaya</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Epik: a Software Program for pKa Prediction and Protonation State Generation for Drug-like Molecules</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>21</volume> (<issue>12</issue>), <fpage>681</fpage>&#x2013;<lpage>691</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-007-9133-z</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tielker</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Eberlein</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>G&#xfc;ssregen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kast</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The SAMPL6 Challenge on Predicting Aqueous pKa Values from EC-RISM Theory</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>32</volume> (<issue>10</issue>), <fpage>1151</fpage>&#x2013;<lpage>1163</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-018-0140-z</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wieder</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Kohlbacher</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kuenemann</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Garon</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ducrot</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Seidel</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>A Compact Review of Molecular Property Prediction with Graph Neural Networks</article-title>. <source>Drug Discov. Today Technol</source>
<comment>. <ext-link ext-link-type="uri" xlink:href="https://www.sciencedirect.com/science/article/pii/S1740674920300305">https://www.sciencedirect.com/science/article/pii/S1740674920300305</ext-link></comment>. <pub-id pub-id-type="doi">10.1016/j.ddtec.2020.11.009</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>P. S.</given-names>
</name>
</person-group> (<year>2019</year>). <source>A Comprehensive Survey on Graph Neural Networks</source>. <comment>jan</comment>. </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jegelka</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Leskovec</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>How Powerful Are Graph Neural Networks? 7th International Conference on Learning Representations</article-title>. <source>ICLR</source> <volume>2019</volume>, <fpage>1</fpage>&#x2013;<lpage>17</lpage>. </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Holistic Prediction of the pKa in Diverse Solvents Based on a Machine&#x2010;Learning Approach</article-title>. <source>Angew. Chem. Int. Ed.</source> <volume>59</volume> (<issue>43</issue>), <fpage>19282</fpage>&#x2013;<lpage>19291</lpage>. <pub-id pub-id-type="doi">10.1002/anie.202008528</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Brooks</surname>
<given-names>B. R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Absolute and Relative pKa Predictions via a DFT Approach Applied to the SAMPL6 Blind Challenge</article-title>. <source>J. Comput. Aided Mol. Des.</source> <volume>32</volume> (<issue>10</issue>), <fpage>1179</fpage>&#x2013;<lpage>1189</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-018-0150-x</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Graph Neural Networks: A Review of Methods and Applications</article-title>. <source>AI Open</source> <volume>1</volume> (<issue>September 2020</issue>), <fpage>57</fpage>&#x2013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2021.01.001</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>