<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">887643</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2022.887643</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A General-Purpose Machine Learning R Library for Sparse Kernels Methods With an Application for Genome-Based Prediction</article-title>
<alt-title alt-title-type="left-running-head">Montesinos L&#xf3;pez et al.</alt-title>
<alt-title alt-title-type="right-running-head">An R Library for Sparse Kernel Methods for Genomic Prediction</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>Osval Antonio</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/988922/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mosqueda Gonz&#xe1;lez</surname>
<given-names>Brandon Alejandro</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Palafox Gonz&#xe1;lez</surname>
<given-names>Abel</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>Abelardo</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Crossa</surname>
<given-names>Jos&#xe9;</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<xref ref-type="fn" rid="fn1">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/50360/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Facultad de Telem&#xe1;tica</institution>, <institution>Universidad de Colima</institution>, <addr-line>Colima</addr-line>, <country>Mexico</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Centro de Investigaci&#xf3;n en Computaci&#xf3;n (CIC)</institution>, <institution>Instituto Polit&#xe9;cnico Nacional (IPN)</institution>, <addr-line>M&#xe9;xico City</addr-line>, <country>Mexico</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Centro Universitario de Ciencias Exactas e Ingenier&#xed;as (CUCEI)</institution>, <institution>Universidad de Guadalajara</institution>, <addr-line>Guadalajara</addr-line>, <country>Mexico</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>International Maize and Wheat Improvement Center (CIMMYT)</institution>, <addr-line>Texcoco</addr-line>, <country>Mexico</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Colegio de Postgraduados</institution>, <addr-line>Montecillo</addr-line>, <country>Mexico</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/154607/overview">Ravi Valluru</ext-link>, University of Lincoln, United Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/738478/overview">Alencar Xavier</ext-link>, Corteva Agriscience&#x2122;, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/823352/overview">Moyses Nascimento</ext-link>, Universidade Federal de Vi&#xe7;osa, Brazil</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Abelardo Montesinos L&#xf3;pez, <email>aml_uach2004@hotmail.com</email>; Jos&#xe9; Crossa, <email>j.crossa@cgiar.org</email>
</corresp>
<fn fn-type="other" id="fn1">
<label>
<sup>&#x2020;</sup>
</label>
<p>ORCID: Jos&#xe9; Crossa, <ext-link ext-link-type="uri" xlink:href="http://orcid.org/0000-0001-9429-5855">orcid.org/0000-0001-9429-5855</ext-link>
</p>
</fn>
<fn fn-type="other">
<p>This article was submitted to Plant Genomics, a section of the journal Frontiers in Genetics</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>03</day>
<month>06</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>13</volume>
<elocation-id>887643</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>03</month>
<year>2022</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>05</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Montesinos L&#xf3;pez, Mosqueda Gonz&#xe1;lez, Palafox Gonz&#xe1;lez, Montesinos L&#xf3;pez and Crossa.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Montesinos L&#xf3;pez, Mosqueda Gonz&#xe1;lez, Palafox Gonz&#xe1;lez, Montesinos L&#xf3;pez and Crossa</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The adoption of machine learning frameworks in areas beyond computer science has been facilitated by the development of user-friendly software tools that do not require an advanced understanding of computer programming. In this paper, we present a new software package (sparse kernel methods, SKM) developed in the R language for implementing six of the most popular supervised machine learning algorithms: generalized boosted machines, generalized linear models, support vector machines, random forest, Bayesian regression models and deep neural networks, with the optional use of sparse kernels. SKM focuses on user simplicity: it does not try to include all available machine learning algorithms, but rather the most important aspects of these six algorithms in an easy-to-understand format. Another relevant contribution of this package is a function for the computation of seven different kernels (Linear, Polynomial, Sigmoid, Gaussian, Exponential, Arc-Cosine 1 and Arc-Cosine L, with L &#x3d; 2, 3, &#x2026;) and their sparse versions, which allow users to create kernel machines without modifying the statistical machine learning algorithm. It is important to point out that the main contribution of our package resides in the functionality for computing the sparse versions of the seven basic kernels, which is indispensable for reducing the computational resources needed to implement kernel machine learning methods without a significant loss in prediction performance. Performance of SKM is evaluated in a genome-based prediction framework using both a maize and a wheat data set. However, the use of this package is not restricted to genome prediction problems; it can be used in many different applications.</p>
</abstract>
<kwd-group>
<kwd>r package</kwd>
<kwd>machine learning</kwd>
<kwd>kernel</kwd>
<kwd>supervised learning</kwd>
<kwd>sparse kernels</kwd>
<kwd>genome-based prediction</kwd>
</kwd-group>
<contract-sponsor id="cn001">Bill and Melinda Gates Foundation<named-content content-type="fundref-id">10.13039/100000865</named-content>
</contract-sponsor>
<contract-sponsor id="cn002">United States Agency for International Development<named-content content-type="fundref-id">10.13039/100000200</named-content>
</contract-sponsor>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Machine learning has become the main approach for solving complex, data-based problems, and it is used everywhere, from devices and digital services such as smartphones and websites to scientific research in various fields (<xref ref-type="bibr" rid="B30">Wang et al., 2016</xref>; <xref ref-type="bibr" rid="B22">Ott et al., 2020</xref>; <xref ref-type="bibr" rid="B28">Shahin et al., 2020</xref>; <xref ref-type="bibr" rid="B17">Montesinos-L&#xf3;pez et al., 2021a</xref>). As machine learning research has progressed, so has the supply and demand of software that facilitates its implementation. For this reason, numerous open-source packages for data-related tasks and machine learning algorithms have become even more prevalent (<xref ref-type="bibr" rid="B1">Abadi et al., 2015</xref>; <xref ref-type="bibr" rid="B32">Wickham et al., 2015</xref>; <xref ref-type="bibr" rid="B23">Pandas development team, 2020</xref>).</p>
<p>One of the most used programming languages for data analysis is R (<xref ref-type="bibr" rid="B26">R Core Team, 2021</xref>) due to its focus on statistical computing, its free and open-source nature and the thousands of packages that extend its power to all kinds of analyses and related data science tasks. In fact, it is difficult to find a machine learning algorithm not implemented within an R package. It can even be said that some R packages contain more complete/specialized implementations (<xref ref-type="bibr" rid="B10">Ishwaran et al., 2008</xref>; <xref ref-type="bibr" rid="B8">Friedman et al., 2010</xref>; <xref ref-type="bibr" rid="B15">Meyer et al., 2019</xref>) than those available in other programming languages. As machine learning is strongly based on statistical models and R is the <italic>de facto</italic> language for statistics research, those who embark on machine learning will encounter R at some point.</p>
<p>Most R packages of machine learning algorithms include one type of model or a family of similar models. While using R packages has clear advantages, there are some challenges. For example, each package has been developed by different authors and there is no standardized code style guideline. This complicates the use of packages, since it requires users to learn the expected data format, the names and expected parameters of functions, and the code conventions (if any) in order to train a model or retrieve outputs. In addition, several complementary packages may be needed to perform cross-validation of models, hyperparameter tuning and computation of accuracy metrics, among other tasks. There are some libraries that seek to integrate a wide range of machine learning tools in one place, such as scikit-learn (<xref ref-type="bibr" rid="B24">Pedregosa et al., 2011</xref>) in Python; H2O in Java (with both R and Python versions); and caret (<xref ref-type="bibr" rid="B11">Kuhn, 2016</xref>), mlr3 (<xref ref-type="bibr" rid="B13">Lang et al., 2019</xref>) and tidymodels (<xref ref-type="bibr" rid="B12">Kuhn and Wickham, 2020</xref>) in R. All these options have their own philosophy, and they were designed using diverse approaches to implement machine learning models.</p>
<p>We consider mlr3 the most powerful R package for machine learning because of its potential scope. The mlr3 package is an object-oriented solution for machine learning focused on extensibility: it does not implement any model itself, but rather provides a unified interface for many existing packages in R. While this is a major advantage, such an approach does not completely remove the dependency on other packages, since it requires knowledge of both mlr3 and the package that implements the model. It is worthwhile to learn how to use all the components of the mlr3 environment because it also provides efficient implementations of most data-related tasks, parallelization, hyperparameter tuning and feature selection, among others. However, it takes time to get accustomed to the way mlr3 works and to how things are defined under the object-oriented paradigm, which is not so common in R programming. Nevertheless, this learning curve is relatively short.</p>
<p>Alternatively, caret and tidymodels provide their own standardized interfaces, a very important factor in good-quality software. Like mlr3, these two packages rely on third-party packages of machine learning algorithms to train models, and they provide different options for the same algorithm. Caret is the oldest of these three packages and, as such, it still enjoys considerable popularity. Notwithstanding, the major advantage of tidymodels is that it belongs to the tidyverse, a collection of R packages tailored for data science that share an underlying design philosophy, grammar and data structures (<xref ref-type="bibr" rid="B31">Wickham et al., 2019</xref>); consequently, users who are familiar with tidyverse packages will naturally start using tidymodels.</p>
<p>In the current paper, we present SKM (Sparse Kernels Methods), a new R package for machine learning that includes functions for model training, tuning, prediction, metrics evaluation and sparse kernel computation. The main goal of this package is to provide stand-alone (self-contained) R software focused on the austere implementation of only six basic supervised learning models that are easy to understand from the user&#x2019;s point of view; these six types of supervised models are explained in the next section. The model functions in SKM were designed with simplicity in mind: the parameters, hyperparameters and tuning specifications are defined directly when calling the function, so users can understand how the package works by studying a handful of examples. Furthermore, we strive to provide clear documentation following a common convention across the functions. Likewise, all the parameters are validated with the checkmate package (<xref ref-type="bibr" rid="B14">Lang, 2017</xref>) to inform the user when an error occurs through meaningful error messages, something that many other packages neglect. The most important hyperparameters of each model can be tuned with two different methods: grid search and Bayesian optimization (<xref ref-type="bibr" rid="B21">Osborne et al., 2009</xref>) based on the code of the Bayesian Optimization package (<xref ref-type="bibr" rid="B34">Yan, 2016</xref>). Although Bayesian optimization is a very popular and effective tuning method, the mlr3 and caret packages do not offer this option.</p>
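<p>As an illustration, the following is a minimal sketch of such a call. Only the function name random_forest comes from SKM (see <xref ref-type="table" rid="T1">Table 1</xref>); the argument names and tuning specification shown here are assumptions made for illustration, not SKM&#x2019;s documented interface.</p>
<p>
<monospace># Hypothetical call; argument names are assumed, not SKM's documented API</monospace>
</p>
<p>
<monospace>model &lt;- random_forest(x = X, y = y, trees_number = c(100, 300, 500), tune_type = "bayesian_optimization")</monospace>
</p>
<p>
<monospace>predictions &lt;- predict(model, X_new)  # X_new: data to predict, assumed</monospace>
</p>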
<p>Kernels have proven to be useful in helping conventional machine learning algorithms capture non-linear patterns in data (<xref ref-type="bibr" rid="B18">Montesinos-L&#xf3;pez et al., 2021b</xref>; <xref ref-type="bibr" rid="B20">Montesinos-L&#xf3;pez et al., 2022a</xref>). In addition to capturing complex non-linear patterns, the sparse versions of kernel methods can also save significant computational resources without a relevant loss in prediction accuracy (<xref ref-type="bibr" rid="B18">Montesinos-L&#xf3;pez et al., 2021b</xref>; <xref ref-type="bibr" rid="B20">Montesinos-L&#xf3;pez et al., 2022a</xref>). In this paper, we define sparse kernels as kernels built with only a fraction of the total inputs, under the assumption that the input matrix is a sparse matrix, that is, a matrix in which many entries are zero. For this reason, the term level of compression is used here, defined as one minus the proportion of the total lines (rows) used to compute the sparse kernels; it represents the level of dimensionality reduction reached by using these sparse kernels. For example, if only 30% of the lines are used to compute a sparse kernel, the level of compression is 0.7. To the best of our knowledge, there is no existing R package for the computation of dense kernels and sparse kernels (which compress the dimension of the dense kernels); this is the added value of SKM and what gives it its name. The approach to sparse kernels implemented in the SKM library is based on the method proposed by <xref ref-type="bibr" rid="B6">Cuevas et al. (2020)</xref>.</p>
<p>As software developers and consumers, we are aware of the importance of sharing our work with the community; accordingly, SKM is completely open-source software released under the GNU Lesser General Public License v3.0 (LGPLv3). As such, anyone can explore the source code, make modifications and build on it to develop other tools.</p>
</sec>
<sec id="s2">
<title>Machine Learning Algorithms</title>
<p>The SKM package includes six different functions of supervised machine learning algorithms. <xref ref-type="table" rid="T1">Table 1</xref> shows the six models that can be implemented with the SKM package, the package of origin that each of these models uses and the SKM function that implements each model.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Models that can be implemented in the SKM library.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Name</th>
<th align="center">Package of origin</th>
<th align="center">Function in SKM</th>
<th align="center">Response variables</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">M1</td>
<td align="left">Generalized boosted machines</td>
<td align="left">gbm (<xref ref-type="bibr" rid="B9">Greenwell et al., 2020</xref>)</td>
<td align="left">generalized_boosted_machine()</td>
<td align="left">Binary, categorical and continuous; only univariate</td>
</tr>
<tr>
<td align="left">M2</td>
<td align="left">Generalized linear models</td>
<td align="left">glmnet (<xref ref-type="bibr" rid="B8">Friedman et al., 2010</xref>)</td>
<td align="left">generalized_linear_model()</td>
<td align="left">Binary, categorical, continuous, and count; univariate, and multivariate only for continuous response variables</td>
</tr>
<tr>
<td align="left">M3</td>
<td align="left">Support vector machines</td>
<td align="left">e1071 (<xref ref-type="bibr" rid="B15">Meyer et al., 2019</xref>)</td>
<td align="left">support_vector_machine()</td>
<td align="left">Binary, categorical and continuous; only univariate response variables</td>
</tr>
<tr>
<td align="left">M4</td>
<td align="left">Random forest</td>
<td align="left">RandomForestSRC (<xref ref-type="bibr" rid="B10">Ishwaran, et al., 2008</xref>)</td>
<td align="left">random_forest ()</td>
<td align="left">Binary, categorical and continuous, univariate and multivariate</td>
</tr>
<tr>
<td align="left">M5</td>
<td align="left">Bayesian regression models</td>
<td align="left">BGLR (<xref ref-type="bibr" rid="B25">Perez and de los Campos, 2014</xref>)</td>
<td align="left">bayesian_model ()</td>
<td align="left">Binary, categorical and continuous, univariate and multivariate only for continuous response variables</td>
</tr>
<tr>
<td align="left">M6</td>
<td align="left">Deep neural networks</td>
<td align="left">keras (<xref ref-type="bibr" rid="B2">Allaire and Chollet, 2016</xref>)</td>
<td align="left">deep_learning ()</td>
<td align="left">Binary, categorical, continuous, and count; univariate and multivariate for all response variables</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>It is important to point out that all models in the SKM library can be implemented with the seven kernel methods and their sparse versions explained in the next section, although in the case of deep neural networks (M6), only fully connected networks can be implemented. Under the Bayesian methods, Bayesian Ridge regression (BRR), Bayes A (Bayes_A), Bayes B (Bayes_B), Bayes C (Bayes_C), Bayesian Lasso (Bayes_Lasso) and the genomic best linear unbiased predictor (GBLUP) in its Bayesian version (BGBLUP) can be implemented. It should be highlighted that the six models of the SKM library, including all the Bayesian methods available in model M5, can work with kernels as follows: first, the matrix of inputs (X) is created; then the square root of the kernel is computed; next, the design matrix of lines is post-multiplied by the square root of the kernel; and finally, this design matrix is used as input in any of the six models. The exception is BGBLUP in model M5, where the computed kernels are used directly.</p>
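<p>A minimal R sketch of this workflow follows. Here X is the matrix of inputs, Z_L is the design matrix of lines and y is the response; kernelize is the SKM function described in the next section, but its argument names, as well as the model call, are shown only as assumptions for illustration.</p>
<p>
<monospace># Sketch only; kernelize() argument names are assumed</monospace>
</p>
<p>
<monospace>K &lt;- kernelize(X, kernel = "Gaussian")</monospace>
</p>
<p>
<monospace>EVD &lt;- eigen(K)  # square root of the kernel via its eigen decomposition</monospace>
</p>
<p>
<monospace>K_sqrt &lt;- EVD$vectors %*% diag(sqrt(pmax(EVD$values, 0))) %*% t(EVD$vectors)</monospace>
</p>
<p>
<monospace>X_input &lt;- Z_L %*% K_sqrt  # post-multiply the design matrix of lines</monospace>
</p>
<p>
<monospace>model &lt;- support_vector_machine(X_input, y)  # any of the six models applies</monospace>
</p>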
<p>The additional layer of abstraction allows all functions to share the same data input format. Internally, the data are adapted to the expected format of each package, and the result and prediction objects returned by these functions also share a common format. Another benefit of these functions is that some parameters that can be inferred from the data itself do not need to be supplied by the user; rather, they are set automatically. For example, the family parameter of the glmnet package, which has to be &#x201c;gaussian&#x201d; for continuous response variables, &#x201c;binomial&#x201d; for binary variables, &#x201c;multinomial&#x201d; for categorical response variables and &#x201c;poisson&#x201d; for count variables, can be inferred from the response variable. In addition, the same functions permit hyperparameter tuning in an easy and user-friendly format without the need to call another function or initiate another object. In theory, as with all packages that internally call functions of other packages, the ease of use and extended functionality come with a slight increase in computational demand for the extra operations required. However, since these operations are computationally inexpensive, there is no significant loss of performance.</p>
<p>
<xref ref-type="sec" rid="s15">Supplementary Appendix SA</xref> included some comparative examples of the equivalent implementation of some machine learning models with mlr3, SKM and randomForestSRC, the original package.</p>
</sec>
<sec id="s3">
<title>Sparse Kernels</title>
<p>As <xref ref-type="bibr" rid="B18">Montesinos-L&#xf3;pez et al. (2021b)</xref> point out, kernel methods transform the independent variables (inputs) using a kernel function and then apply conventional machine learning techniques to the transformed data to achieve better results, mainly when the inputs contain non-linear patterns. Kernel methods are excellent options in terms of computational efficiency when managing large, complex data that show non-linear patterns; likewise, they can be used with any type of predictive machine. Consequently, we have included the kernelize function in SKM, which can compute the same seven kernels and their sparse versions as described in <xref ref-type="bibr" rid="B18">Montesinos-L&#xf3;pez et al. (2021b)</xref>: Linear, Polynomial, Sigmoid, Gaussian, Exponential, Arc-Cosine 1 and Arc-Cosine L (with L &#x3d; 2, 3, &#x2026;). The kernel computation is independent of the model fitting process, which allows the kernelize function to be used with other packages or, conversely, allows the machine learning algorithms of SKM to be used without kernels.</p>
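<p>A hedged sketch of computing kernels with this function is shown below; the argument names (kernel, and the one controlling the sparse approximation) are assumptions made for illustration, not the documented interface.</p>
<p>
<monospace># Assumed argument names; see the package documentation for the exact interface</monospace>
</p>
<p>
<monospace>K_dense &lt;- kernelize(X, kernel = "Polynomial")</monospace>
</p>
<p>
<monospace>K_sparse &lt;- kernelize(X, kernel = "Polynomial", rows_proportion = 0.5)  # hypothetical sparse version</monospace>
</p>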
<p>Next, the algorithm used to approximate the kernels, here called sparse kernels, is described in general terms. We assume that the response variable <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is associated to the genomic effects <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>u</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> as:<disp-formula id="e1">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold-italic">e</mml:mi>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf3">
<mml:math id="m4">
<mml:mi>&#x3bc;</mml:mi>
</mml:math>
</inline-formula> is the overall mean, <bold>1</bold> is the vector of ones, and <inline-formula id="inf4">
<mml:math id="m5">
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:math>
</inline-formula> is the vector of size <inline-formula id="inf5">
<mml:math id="m6">
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula>. Moreover, <inline-formula id="inf6">
<mml:math id="m7">
<mml:mi mathvariant="bold-italic">u</mml:mi>
</mml:math>
</inline-formula> is the vector of genomic effects <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>u</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>u</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is the genomic variance component and matrix <inline-formula id="inf9">
<mml:math id="m10">
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:math>
</inline-formula> is the dense kernel of order <inline-formula id="inf10">
<mml:math id="m11">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> constructed with any of the kernel methods explained above. The random residuals are assumed independent with normal distribution <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi mathvariant="bold-italic">e</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>e</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>e</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is the error variance. The dense kernel, <inline-formula id="inf13">
<mml:math id="m14">
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:math>
</inline-formula>, can be approximated as <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mo>&#x2248;</mml:mo>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B33">Williams and Seeger, 2001</xref>), where <inline-formula id="inf15">
<mml:math id="m16">
<mml:mi mathvariant="bold-italic">Q</mml:mi>
</mml:math>
</inline-formula> will have the rank of <inline-formula id="inf16">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, that is, <inline-formula id="inf17">
<mml:math id="m18">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula>. The computation of this kernel is facilitated because it is not necessary to compute and store the original matrix <inline-formula id="inf18">
<mml:math id="m19">
<mml:mi mathvariant="bold-italic">K</mml:mi>
</mml:math>
</inline-formula>, since only <inline-formula id="inf19">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf20">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are required. This approximation of the dense kernel (which we call the sparse kernel) uses <inline-formula id="inf21">
<mml:math id="m22">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> out of <inline-formula id="inf22">
<mml:math id="m23">
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula> lines to compute <inline-formula id="inf23">
<mml:math id="m24">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, then an eigenvalue decomposition of <inline-formula id="inf24">
<mml:math id="m25">
<mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is used, where <inline-formula id="inf25">
<mml:math id="m26">
<mml:mi mathvariant="bold-italic">U</mml:mi>
</mml:math>
</inline-formula> is the matrix of eigenvectors of order <inline-formula id="inf26">
<mml:math id="m27">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf27">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a diagonal matrix of order <inline-formula id="inf28">
<mml:math id="m29">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> with the eigenvalues ordered from largest to smallest. Next, these values are substituted in <inline-formula id="inf29">
<mml:math id="m30">
<mml:mrow>
<mml:mi mathvariant="bold-italic">Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> resulting in <inline-formula id="inf30">
<mml:math id="m31">
<mml:mrow>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>u</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, and thus, model (1) can be expressed as:<disp-formula id="e2">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#xb5;</mml:mo>
<mml:msub>
<mml:mn>1</mml:mn>
<mml:mi mathvariant="bold-italic">n</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3b5;</mml:mi>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Model (2) is similar to model (1), except that <inline-formula id="inf31">
<mml:math id="m33">
<mml:mi mathvariant="bold-italic">f</mml:mi>
</mml:math>
</inline-formula> is a vector of order <inline-formula id="inf32">
<mml:math id="m34">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> with a normal distribution of the form <inline-formula id="inf33">
<mml:math id="m35">
<mml:mrow>
<mml:mi mathvariant="bold-italic">f</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi>f</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi mathvariant="bold-italic">I</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf34">
<mml:math id="m36">
<mml:mrow>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is now the design matrix. This implies estimating only <inline-formula id="inf35">
<mml:math id="m37">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> effects that are projected into the <inline-formula id="inf36">
<mml:math id="m38">
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula> dimensional space in order to predict <inline-formula id="inf37">
<mml:math id="m39">
<mml:mi mathvariant="bold-italic">u</mml:mi>
</mml:math>
</inline-formula> and explain <inline-formula id="inf38">
<mml:math id="m40">
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:math>
</inline-formula>. Note that model (2) can be implemented under a conventional mixed model framework or under any statistical machine learning algorithm assuming that the <inline-formula id="inf39">
<mml:math id="m41">
<mml:mi mathvariant="bold-italic">f</mml:mi>
</mml:math>
</inline-formula> term of <xref ref-type="disp-formula" rid="e2">Equation 2</xref> is a fixed effect. For example, under a linear kernel the <inline-formula id="inf40">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf41">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be computed as <inline-formula id="inf42">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mi>p</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:math>
</inline-formula> and <inline-formula id="inf43">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
<mml:mi>p</mml:mi>
</mml:mfrac>
</mml:mrow>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mo>&#xa0;</mml:mo>
</mml:math>
</inline-formula> respectively, where <inline-formula id="inf44">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the centered and scaled matrix of markers with <inline-formula id="inf45">
<mml:math id="m47">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> lines and <inline-formula id="inf46">
<mml:math id="m48">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula> markers, and <inline-formula id="inf47">
<mml:math id="m49">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">X</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the centered and scaled matrix of markers with <inline-formula id="inf48">
<mml:math id="m50">
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula> lines and <inline-formula id="inf49">
<mml:math id="m51">
<mml:mi>p</mml:mi>
</mml:math>
</inline-formula> markers. In summary, according to <xref ref-type="bibr" rid="B6">Cuevas et al. (2020)</xref>, the approximation described above consists of the following steps (a minimal R sketch is given after the list):<list list-type="simple">
<list-item>
<p>Step 1: Computing the following matrices, matrix <inline-formula id="inf50">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from <inline-formula id="inf51">
<mml:math id="m53">
<mml:mi>m</mml:mi>
</mml:math>
</inline-formula> lines of the training set.</p>
</list-item>
<list-item>
<p>Step 2: Constructing matrix <inline-formula id="inf52">
<mml:math id="m54">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>Step 3: Eigen value decomposition of <inline-formula id="inf53">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>Step 4: Computing matrix <inline-formula id="inf54">
<mml:math id="m56">
<mml:mrow>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p>Step 5: Fitting the model under any of the above-mentioned statistical machine learning algorithms using <inline-formula id="inf55">
<mml:math id="m57">
<mml:mrow>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">K</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="bold-italic">U</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold-italic">S</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as design matrix and <inline-formula id="inf56">
<mml:math id="m58">
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:math>
</inline-formula> as response variable.</p>
</list-item>
</list>
</p>
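<p>The following minimal R sketch implements these five steps for a linear kernel, assuming X is the centered and scaled marker matrix, y the response and rows_idx the indices of the m selected lines; all object names are illustrative, and the final call stands for any of the six SKM model functions.</p>
<p>
<monospace># Illustrative sketch of the sparse kernel approximation (linear kernel assumed)</monospace>
</p>
<p>
<monospace>p &lt;- ncol(X)</monospace>
</p>
<p>
<monospace>X_m &lt;- X[rows_idx, ]  # the m selected lines of the training set</monospace>
</p>
<p>
<monospace>K_mm &lt;- X_m %*% t(X_m) / p  # Step 1: m x m kernel</monospace>
</p>
<p>
<monospace>K_nm &lt;- X %*% t(X_m) / p  # Step 2: n x m kernel</monospace>
</p>
<p>
<monospace>EVD &lt;- eigen(K_mm)  # Step 3: eigenvalue decomposition (K_mm assumed positive definite)</monospace>
</p>
<p>
<monospace>P &lt;- K_nm %*% EVD$vectors %*% diag(1 / sqrt(EVD$values))  # Step 4: design matrix</monospace>
</p>
<p>
<monospace>model &lt;- bayesian_model(P, y)  # Step 5: assumed call; any SKM model applies</monospace>
</p>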
<p>One of the major advantages of the sparse kernels is dimensionality reduction, since the number of parameters to be estimated is reduced significantly in comparison to the dense kernels. This is useful when working with high-dimensional data, where the number of columns is considerably greater than the number of rows, and it makes the training process of the model more efficient. More details about the kernels and the approximated kernels implemented in the SKM library, here called sparse kernels, can be found in <xref ref-type="bibr" rid="B18">Montesinos-L&#xf3;pez et al. (2021b)</xref> and <xref ref-type="bibr" rid="B20">Montesinos-L&#xf3;pez et al. (2022a)</xref>.</p>
<p>In <xref ref-type="sec" rid="s15">Supplementary Appendix SB</xref> we have included some examples of how to use the kernelize function of SKM to compute the different kernels and their sparse versions.</p>
</sec>
<sec id="s4">
<title>Evaluation Metrics</title>
<p>Evaluating a model&#x2019;s performance is an important task in all machine learning workflows. For this reason, in SKM we have included functions for the most popular metrics used to evaluate model performance in both regression and classification problems. The regression metrics included are: Mean Squared Error (MSE), Root Mean Squared Error (RMSE), Normalized Root Mean Squared Error (NRMSE, with four types of normalization: by standard deviation, mean, range and interquartile range), Mean Absolute Error (MAE) and Mean Arctangent Absolute Percentage Error (MAAPE). The classification metrics included are: accuracy, specificity, sensitivity, Kappa coefficient, Brier score, Matthews correlation coefficient, precision, recall, Area Under the ROC Curve (ROC-AUC), Precision-Recall Area Under the Curve (PR-AUC), F1 score and a function to compute the confusion matrix. In addition, the wrapper functions numeric_summary and categorical_summary compute all the regression and classification metrics, respectively, to obtain a complete summary of the model&#x2019;s performance with a single function call. More details about most of these metrics can be found in chapter 4 (Overfitting, model tuning and evaluation of prediction performance) of the book <italic>Multivariate Statistical Machine Learning Methods for Genomic Prediction</italic> (<xref ref-type="bibr" rid="B19">Montesinos-L&#xf3;pez et al., 2022b</xref>).</p>
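<p>For instance, a metric function typically receives the observed and predicted values and returns a numeric value. In the sketch below, the function names follow the metric list above and the argument order is assumed, not taken from the package documentation.</p>
<p>
<monospace># Assumed function names and argument order</monospace>
</p>
<p>
<monospace>error &lt;- rmse(observed, predicted)</monospace>
</p>
<p>
<monospace>all_metrics &lt;- numeric_summary(observed, predicted)  # all regression metrics at once</monospace>
</p>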
<p>As expected, all these metric functions work in harmony with the machine learning algorithm functions: since they use the same data format, no extra data processing is necessary. This does not limit or complicate their use with other packages, as shown in the detailed documentation provided.</p>
<p>
<xref ref-type="sec" rid="s15">Supplementary Appendices SA, SB</xref> include examples of some metric functions that receive the observed and predicted values (or probabilities in classification) and return a numeric value.</p>
</sec>
<sec id="s5">
<title>Installation</title>
<p>SKM is a package built for the R ecosystem. As an open-source project, the package was first published in a GitHub repository at <ext-link ext-link-type="uri" xlink:href="https://github.com/brandon-mosqueda/SKM">https://github.com/brandon-mosqueda/SKM</ext-link>, where the full source code can be found, together with instructions for installing the development (most up-to-date) version. This development version may include corrections of reported bugs and new functionalities, among other changes. Likewise, in the repository users can also find a place to report bugs or contribute to the project. To install the development version, the following commands must be executed in an R terminal (note that devtools is installed first, since the remaining commands depend on it):</p>
<p>
<monospace>if (!require("devtools")) {install.packages("devtools")}</monospace>
</p>
<p>
<monospace>devtools::install_github("cran/randomForestSRC")</monospace>
</p>
<p>
<monospace>devtools::install_github("gdlc/BGLR-R")</monospace>
</p>
<p>
<monospace>devtools::install_github("rstudio/tensorflow")</monospace>
</p>
<p>
<monospace>devtools::install_github("brandon-mosqueda/SKM")</monospace>
</p>
</sec>
<sec id="s6">
<title>Illustrative Examples</title>
<p>Next, we illustrate the use of the SKM library with two popular data sets in genomic selection, using five random partitions to evaluate the prediction performance with the two available tuning options. The response variables in both data sets are numeric, so we present the prediction performance in terms of Mean Arctangent Absolute Percentage Error (MAAPE), Mean Absolute Error (MAE), Mean Squared Error (MSE) and Normalized Root Mean Squared Error (NRMSE). We have included a function in SKM (summaries) to compute prediction performance summaries for genomic selection data. This function requires a data.frame with the complete predictions across folds, including genotype and environment information; it is used in all the examples described below.</p>
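<p>A hedged sketch of the expected input for this function is shown below; the column names are assumptions based on the description above, not the documented interface.</p>
<p>
<monospace># Toy example; column names are assumed</monospace>
</p>
<p>
<monospace>predictions &lt;- data.frame(Fold = rep(1:5, each = 4), Line = rep(paste0("L", 1:4), 5), Env = "E1", Observed = rnorm(20), Predicted = rnorm(20))</monospace>
</p>
<p>
<monospace>results &lt;- summaries(predictions)</monospace>
</p>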
<sec id="s6-1">
<title>Wheat Data</title>
<p>This data set was first used by <xref ref-type="bibr" rid="B3">Crossa et al. (2010)</xref> and later by <xref ref-type="bibr" rid="B5">Cuevas et al. (2016)</xref>, <xref ref-type="bibr" rid="B4">Cuevas et al. (2017)</xref> and <xref ref-type="bibr" rid="B7">Cuevas et al. (2019)</xref>; it comprises 599 wheat lines from the CIMMYT Global Wheat Program evaluated in four international environments representing four basic agroclimatic regions (mega-environments). The phenotypic trait considered for the 599 wheat lines evaluated in each of the four mega-environments was grain yield (GY). The 599 wheat lines were genotyped using 1447 Diversity Array Technology (DArT) markers generated by Triticarte Pty. Ltd.</p>
<p>In this example, we evaluated the six models included in the package, each one using Bayesian optimization to tune its specific hyperparameters, with the exception of the Bayesian methods (model M5 in <xref ref-type="table" rid="T1">Table 1</xref>), which do not require hyperparameter tuning. Prediction accuracy was evaluated with five random partitions (splits), where 80% of the data was used for training and 20% for the testing set, and the average over the five testing sets was reported as the prediction performance. To tune the hyperparameters, an inner 5-fold cross-validation was used to evaluate each hyperparameter combination. It is important to point out that the inner 5-fold cross-validation is implemented within each partition, which in this case contains only 80% of the data; hence, each inner training set contains only 64% of the data, while the validation set contains only 16%. In <xref ref-type="table" rid="T2">Table 2</xref>, the evaluation results are presented for the wheat data set, while the code for implementing the six models is given in <xref ref-type="sec" rid="s15">Supplementary Appendix SC</xref>.</p>
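<p>A minimal base-R sketch of this outer partitioning scheme is shown below (illustrative only; SKM&#x2019;s own cross-validation helpers may differ).</p>
<p>
<monospace>set.seed(7)  # any seed, for reproducibility</monospace>
</p>
<p>
<monospace>n &lt;- 599  # number of wheat lines</monospace>
</p>
<p>
<monospace>partitions &lt;- lapply(1:5, function(i) {</monospace>
</p>
<p>
<monospace>  training &lt;- sample(n, size = round(0.8 * n))  # 80% for training</monospace>
</p>
<p>
<monospace>  list(training = training, testing = setdiff(1:n, training))  # 20% for testing</monospace>
</p>
<p>
<monospace>})</monospace>
</p>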
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Prediction performance of the Wheat data set for each environment and across environments (Global) of each of the six models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th rowspan="2" align="center">Metric</th>
<th colspan="2" align="center">E1</th>
<th colspan="2" align="center">E2</th>
<th colspan="2" align="center">E3</th>
<th colspan="2" align="center">E4</th>
<th colspan="2" align="center">Global</th>
</tr>
<tr>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">M1</td>
<td>MAAPE</td>
<td align="char" char=".">0.7307</td>
<td align="char" char=".">0.0069</td>
<td align="char" char=".">0.6852</td>
<td align="char" char=".">0.0210</td>
<td align="char" char=".">0.6993</td>
<td align="char" char=".">0.0188</td>
<td align="char" char=".">0.6922</td>
<td align="char" char=".">0.0104</td>
<td align="char" char=".">0.7082</td>
<td align="char" char=".">0.0090</td>
</tr>
<tr>
<td align="left">M1</td>
<td>MAE</td>
<td align="char" char=".">0.6801</td>
<td align="char" char=".">0.0133</td>
<td align="char" char=".">0.6360</td>
<td align="char" char=".">0.0272</td>
<td align="char" char=".">0.6644</td>
<td align="char" char=".">0.0314</td>
<td align="char" char=".">0.5935</td>
<td align="char" char=".">0.0162</td>
<td align="char" char=".">0.5955</td>
<td align="char" char=".">0.0123</td>
</tr>
<tr>
<td align="left">M1</td>
<td>MSE</td>
<td align="char" char=".">0.7359</td>
<td align="char" char=".">0.0210</td>
<td align="char" char=".">0.6931</td>
<td align="char" char=".">0.0494</td>
<td align="char" char=".">0.7908</td>
<td align="char" char=".">0.0820</td>
<td align="char" char=".">0.6007</td>
<td align="char" char=".">0.0332</td>
<td align="char" char=".">0.5951</td>
<td align="char" char=".">0.0233</td>
</tr>
<tr>
<td align="left">M1</td>
<td>NRMSE</td>
<td align="char" char=".">0.8575</td>
<td align="char" char=".">0.0174</td>
<td align="char" char=".">0.8173</td>
<td align="char" char=".">0.0263</td>
<td align="char" char=".">0.8763</td>
<td align="char" char=".">0.0271</td>
<td align="char" char=".">0.7915</td>
<td align="char" char=".">0.0157</td>
<td align="char" char=".">0.8316</td>
<td align="char" char=".">0.0081</td>
</tr>
<tr>
<td align="left">M1</td>
<td>RMSE</td>
<td align="char" char=".">0.8575</td>
<td align="char" char=".">0.0123</td>
<td align="char" char=".">0.8304</td>
<td align="char" char=".">0.0298</td>
<td align="char" char=".">0.8839</td>
<td align="char" char=".">0.0490</td>
<td align="char" char=".">0.7738</td>
<td align="char" char=".">0.0221</td>
<td align="char" char=".">0.7708</td>
<td align="char" char=".">0.0155</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MAAPE</td>
<td align="char" char=".">0.7134</td>
<td align="char" char=".">0.0118</td>
<td align="char" char=".">0.7506</td>
<td align="char" char=".">0.0107</td>
<td align="char" char=".">0.7460</td>
<td align="char" char=".">0.0118</td>
<td align="char" char=".">0.7635</td>
<td align="char" char=".">0.0049</td>
<td align="char" char=".">0.7500</td>
<td align="char" char=".">0.0112</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MAE</td>
<td align="char" char=".">0.7023</td>
<td align="char" char=".">0.0303</td>
<td align="char" char=".">0.7116</td>
<td align="char" char=".">0.0292</td>
<td align="char" char=".">0.7670</td>
<td align="char" char=".">0.0283</td>
<td align="char" char=".">0.7179</td>
<td align="char" char=".">0.0131</td>
<td align="char" char=".">0.6748</td>
<td align="char" char=".">0.0163</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MSE</td>
<td align="char" char=".">0.7845</td>
<td align="char" char=".">0.0527</td>
<td align="char" char=".">0.8980</td>
<td align="char" char=".">0.0646</td>
<td align="char" char=".">0.9876</td>
<td align="char" char=".">0.0746</td>
<td align="char" char=".">0.8581</td>
<td align="char" char=".">0.0237</td>
<td align="char" char=".">0.7645</td>
<td align="char" char=".">0.0289</td>
</tr>
<tr>
<td align="left">M2</td>
<td>NRMSE</td>
<td align="char" char=".">0.8747</td>
<td align="char" char=".">0.0193</td>
<td align="char" char=".">0.9429</td>
<td align="char" char=".">0.0200</td>
<td align="char" char=".">0.9622</td>
<td align="char" char=".">0.0145</td>
<td align="char" char=".">0.9344</td>
<td align="char" char=".">0.0123</td>
<td align="char" char=".">0.9311</td>
<td align="char" char=".">0.0116</td>
</tr>
<tr>
<td align="left">M2</td>
<td>RMSE</td>
<td align="char" char=".">0.8836</td>
<td align="char" char=".">0.0305</td>
<td align="char" char=".">0.9450</td>
<td align="char" char=".">0.0349</td>
<td align="char" char=".">0.9909</td>
<td align="char" char=".">0.0380</td>
<td align="char" char=".">0.9260</td>
<td align="char" char=".">0.0129</td>
<td align="char" char=".">0.8737</td>
<td align="char" char=".">0.0167</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MAAPE</td>
<td align="char" char=".">0.7857</td>
<td align="char" char=".">0.0038</td>
<td align="char" char=".">0.7835</td>
<td align="char" char=".">0.0056</td>
<td align="char" char=".">0.7877</td>
<td align="char" char=".">0.0010</td>
<td align="char" char=".">0.7848</td>
<td align="char" char=".">0.0019</td>
<td align="char" char=".">0.7856</td>
<td align="char" char=".">0.0015</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MAE</td>
<td align="char" char=".">0.7675</td>
<td align="char" char=".">0.0186</td>
<td align="char" char=".">0.7972</td>
<td align="char" char=".">0.0267</td>
<td align="char" char=".">0.7766</td>
<td align="char" char=".">0.0082</td>
<td align="char" char=".">0.7805</td>
<td align="char" char=".">0.0228</td>
<td align="char" char=".">0.7341</td>
<td align="char" char=".">0.0133</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MSE</td>
<td align="char" char=".">0.9014</td>
<td align="char" char=".">0.0324</td>
<td align="char" char=".">1.0875</td>
<td align="char" char=".">0.0656</td>
<td align="char" char=".">0.9583</td>
<td align="char" char=".">0.0268</td>
<td align="char" char=".">1.0724</td>
<td align="char" char=".">0.0343</td>
<td align="char" char=".">0.9035</td>
<td align="char" char=".">0.0271</td>
</tr>
<tr>
<td align="left">M3</td>
<td>NRMSE</td>
<td align="char" char=".">0.9997</td>
<td align="char" char=".">0.0013</td>
<td align="char" char=".">1.0013</td>
<td align="char" char=".">0.0021</td>
<td align="char" char=".">1.0012</td>
<td align="char" char=".">0.0045</td>
<td align="char" char=".">1.0027</td>
<td align="char" char=".">0.0017</td>
<td align="char" char=".">1.0004</td>
<td align="char" char=".">0.0004</td>
</tr>
<tr>
<td align="left">M3</td>
<td>RMSE</td>
<td align="char" char=".">0.9488</td>
<td align="char" char=".">0.0171</td>
<td align="char" char=".">1.0409</td>
<td align="char" char=".">0.0320</td>
<td align="char" char=".">0.9785</td>
<td align="char" char=".">0.0138</td>
<td align="char" char=".">1.0350</td>
<td align="char" char=".">0.0166</td>
<td align="char" char=".">0.9501</td>
<td align="char" char=".">0.0142</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MAAPE</td>
<td align="char" char=".">0.7161</td>
<td align="char" char=".">0.0134</td>
<td align="char" char=".">0.6835</td>
<td align="char" char=".">0.0169</td>
<td align="char" char=".">0.6902</td>
<td align="char" char=".">0.0128</td>
<td align="char" char=".">0.6898</td>
<td align="char" char=".">0.0204</td>
<td align="char" char=".">0.6965</td>
<td align="char" char=".">0.0100</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MAE</td>
<td align="char" char=".">0.6733</td>
<td align="char" char=".">0.0273</td>
<td align="char" char=".">0.6258</td>
<td align="char" char=".">0.0081</td>
<td align="char" char=".">0.7060</td>
<td align="char" char=".">0.0196</td>
<td align="char" char=".">0.5945</td>
<td align="char" char=".">0.0094</td>
<td align="char" char=".">0.5864</td>
<td align="char" char=".">0.0083</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MSE</td>
<td align="char" char=".">0.7063</td>
<td align="char" char=".">0.0450</td>
<td align="char" char=".">0.6793</td>
<td align="char" char=".">0.0049</td>
<td align="char" char=".">0.8221</td>
<td align="char" char=".">0.0494</td>
<td align="char" char=".">0.6291</td>
<td align="char" char=".">0.0409</td>
<td align="char" char=".">0.5769</td>
<td align="char" char=".">0.0186</td>
</tr>
<tr>
<td align="left">M4</td>
<td>NRMSE</td>
<td align="char" char=".">0.8472</td>
<td align="char" char=".">0.0163</td>
<td align="char" char=".">0.8105</td>
<td align="char" char=".">0.0159</td>
<td align="char" char=".">0.8470</td>
<td align="char" char=".">0.0178</td>
<td align="char" char=".">0.7963</td>
<td align="char" char=".">0.0151</td>
<td align="char" char=".">0.8123</td>
<td align="char" char=".">0.0080</td>
</tr>
<tr>
<td align="left">M4</td>
<td>RMSE</td>
<td align="char" char=".">0.8387</td>
<td align="char" char=".">0.0264</td>
<td align="char" char=".">0.8242</td>
<td align="char" char=".">0.0030</td>
<td align="char" char=".">0.9050</td>
<td align="char" char=".">0.0275</td>
<td align="char" char=".">0.7915</td>
<td align="char" char=".">0.0252</td>
<td align="char" char=".">0.7592</td>
<td align="char" char=".">0.0123</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MAAPE</td>
<td align="char" char=".">0.7133</td>
<td align="char" char=".">0.0108</td>
<td align="char" char=".">0.6956</td>
<td align="char" char=".">0.0107</td>
<td align="char" char=".">0.7233</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.7455</td>
<td align="char" char=".">0.0046</td>
<td align="char" char=".">0.7211</td>
<td align="char" char=".">0.0043</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MAE</td>
<td align="char" char=".">0.7141</td>
<td align="char" char=".">0.0183</td>
<td align="char" char=".">0.6336</td>
<td align="char" char=".">0.0116</td>
<td align="char" char=".">0.6846</td>
<td align="char" char=".">0.0272</td>
<td align="char" char=".">0.6572</td>
<td align="char" char=".">0.0291</td>
<td align="char" char=".">0.6156</td>
<td align="char" char=".">0.0056</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MSE</td>
<td align="char" char=".">0.7987</td>
<td align="char" char=".">0.0387</td>
<td align="char" char=".">0.6587</td>
<td align="char" char=".">0.0170</td>
<td align="char" char=".">0.7696</td>
<td align="char" char=".">0.0607</td>
<td align="char" char=".">0.7021</td>
<td align="char" char=".">0.0639</td>
<td align="char" char=".">0.6183</td>
<td align="char" char=".">0.0104</td>
</tr>
<tr>
<td align="left">M5</td>
<td>NRMSE</td>
<td align="char" char=".">0.8796</td>
<td align="char" char=".">0.0230</td>
<td align="char" char=".">0.8168</td>
<td align="char" char=".">0.0220</td>
<td align="char" char=".">0.8742</td>
<td align="char" char=".">0.0121</td>
<td align="char" char=".">0.8808</td>
<td align="char" char=".">0.0148</td>
<td align="char" char=".">0.8547</td>
<td align="char" char=".">0.0081</td>
</tr>
<tr>
<td align="left">M5</td>
<td>RMSE</td>
<td align="char" char=".">0.8927</td>
<td align="char" char=".">0.0212</td>
<td align="char" char=".">0.8113</td>
<td align="char" char=".">0.0104</td>
<td align="char" char=".">0.8744</td>
<td align="char" char=".">0.0355</td>
<td align="char" char=".">0.8346</td>
<td align="char" char=".">0.0369</td>
<td align="char" char=".">0.7862</td>
<td align="char" char=".">0.0066</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MAAPE</td>
<td align="char" char=".">0.7056</td>
<td align="char" char=".">0.0071</td>
<td align="char" char=".">0.6991</td>
<td align="char" char=".">0.0107</td>
<td align="char" char=".">0.7149</td>
<td align="char" char=".">0.0132</td>
<td align="char" char=".">0.7058</td>
<td align="char" char=".">0.0037</td>
<td align="char" char=".">0.7075</td>
<td align="char" char=".">0.0067</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MAE</td>
<td align="char" char=".">0.6938</td>
<td align="char" char=".">0.0144</td>
<td align="char" char=".">0.6358</td>
<td align="char" char=".">0.0204</td>
<td align="char" char=".">0.6802</td>
<td align="char" char=".">0.0280</td>
<td align="char" char=".">0.6327</td>
<td align="char" char=".">0.0103</td>
<td align="char" char=".">0.6170</td>
<td align="char" char=".">0.0122</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MSE</td>
<td align="char" char=".">0.8160</td>
<td align="char" char=".">0.0499</td>
<td align="char" char=".">0.6978</td>
<td align="char" char=".">0.0452</td>
<td align="char" char=".">0.7807</td>
<td align="char" char=".">0.0678</td>
<td align="char" char=".">0.7183</td>
<td align="char" char=".">0.0226</td>
<td align="char" char=".">0.6645</td>
<td align="char" char=".">0.0355</td>
</tr>
<tr>
<td align="left">M6</td>
<td>NRMSE</td>
<td align="char" char=".">0.8918</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.8385</td>
<td align="char" char=".">0.0188</td>
<td align="char" char=".">0.8889</td>
<td align="char" char=".">0.0230</td>
<td align="char" char=".">0.8534</td>
<td align="char" char=".">0.0167</td>
<td align="char" char=".">0.8669</td>
<td align="char" char=".">0.0119</td>
</tr>
<tr>
<td align="left">M6</td>
<td>RMSE</td>
<td align="char" char=".">0.9016</td>
<td align="char" char=".">0.0279</td>
<td align="char" char=".">0.8336</td>
<td align="char" char=".">0.0267</td>
<td align="char" char=".">0.8802</td>
<td align="char" char=".">0.0386</td>
<td align="char" char=".">0.8471</td>
<td align="char" char=".">0.0133</td>
<td align="char" char=".">0.8140</td>
<td align="char" char=".">0.0217</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Generalized boosted machines (M1), generalized linear models (M2), support vector machines (M3), random forest (M4), Bayesian regression models (M5) and deep neural networks (M6). The tuning process was done under the Bayesian optimization framework. Mean is the average of the five partitions for each metric and SE denotes the standard error for each metric. E1-E4 denote location 1, location 2, location 3 and location 4.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="fig" rid="F1">Figures 1</xref>, <xref ref-type="fig" rid="F2">2</xref> we compare the prediction performance of the six evaluated models across environments in terms of MSE and NRMSE, respectively. Both figures show a similar pattern in the prediction performance results. In terms of both metrics, M4, M1 and M5 produced the best prediction performance. In terms of MSE, the best model (M4) outperformed M1 by <inline-formula id="inf57">
<mml:math id="m59">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.5951</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.5769</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.5951</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>3.05</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, M2 by <inline-formula id="inf58">
<mml:math id="m60">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.7645</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.5769</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.7645</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>32.51</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, M3 (the worst) by <inline-formula id="inf59">
<mml:math id="m61">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.9035</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.5769</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.9035</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>36.14</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, M5 by <inline-formula id="inf60">
<mml:math id="m62">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.6183</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.5769</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.6183</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6.69</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and M6 by <inline-formula id="inf61">
<mml:math id="m63">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.6645</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.5769</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.6645</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>13.18</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Regarding NRMSE, the differences between models are not as large as in MSE terms. For example, the advantage of the best model (M4) over the worst (M3) was <inline-formula id="inf62">
<mml:math id="m64">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.8123</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mn>1</mml:mn>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>18.77</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, significantly different from the 36.14% in MSE terms. It should be noted that in all the examples provided, model M5 was implemented with Bayesian Ridge Regression (BRR, which works with the scaled matrix of markers <inline-formula id="inf63">
<mml:math id="m65">
<mml:mi>Z</mml:mi>
</mml:math>
</inline-formula>), which is equivalent to BGBLUP [which works with the linear kernel computed as <inline-formula id="inf64">
<mml:math id="m66">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mi>Z</mml:mi>
<mml:msup>
<mml:mi>Z</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>Z</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>]. As mentioned before, the other Bayesian methods can be implemented by merely changing &#x201c;BRR&#x201d; in the model argument to one of the other available options: Bayes_A, Bayes_B, Bayes_C, Bayes_Lasso and BGBLUP (see <xref ref-type="sec" rid="s15">Supplementary Appendix SB5</xref>, Bayesian regression model).</p>
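<p>To make this equivalence concrete, the following minimal sketch fits both parameterizations directly with the BGLR package (on which the Bayesian models of SKM are built). The marker matrix <monospace>Z</monospace> and response <monospace>y</monospace> are simulated placeholders, not data from the examples above.</p>
<preformat>
library(BGLR)

## Simulated placeholder data: n lines with p centered and scaled markers
set.seed(1)
n &lt;- 100
p &lt;- 500
Z &lt;- scale(matrix(rnorm(n * p), n, p))
y &lt;- rnorm(n)

## BRR: ridge-type prior on the marker effects, taking Z directly as input
fit_brr &lt;- BGLR(y = y, ETA = list(list(X = Z, model = "BRR")),
                nIter = 3000, burnIn = 1000, verbose = FALSE)

## Equivalent BGBLUP: RKHS regression on the linear kernel K = ZZ'/ncol(Z)
K &lt;- tcrossprod(Z) / ncol(Z)
fit_gblup &lt;- BGLR(y = y, ETA = list(list(K = K, model = "RKHS")),
                  nIter = 3000, burnIn = 1000, verbose = FALSE)
</preformat>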
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Prediction performance in terms of Mean Squared Error of the six models (M1, M2, M3, M4, M5, M6) across environments (Global) in the wheat data. M1 denotes the generalized boosted machine model, M2 denotes the generalized linear model, M3 denotes the support vector machine model, M4 denotes the random forest model, M5 denotes the Bayesian regression model and M6 denotes the deep neural networks model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Prediction performance in terms of Normalized Root Mean Squared Error of the six models (M1, M2, M3, M4, M5, M6) across environments (Global) in the wheat data. M1 denotes the generalized boosted machine model, M2 denotes the generalized linear model, M3 denotes the support vector machine model, M4 denotes the random forest model, M5 denotes the Bayesian regression model and M6 denotes the deep neural networks model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g002.tif"/>
</fig>
</sec>
<sec id="s6-2">
<title>Maize Data</title>
<p>This maize data set was included in <xref ref-type="bibr" rid="B29">Souza et al. (2017)</xref> and comes from the Universidade de S&#xe3;o Paulo (USP). It consists of 722 maize hybrids (722 <inline-formula id="inf65">
<mml:math id="m67">
<mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> &#x3d; 2,888 observations across environments) obtained by crossing 49 inbred lines. The hybrids were evaluated in four environments (E1-E4) in Piracicaba and Anhumas, S&#xe3;o Paulo, Brazil, in 2016, using an augmented block design with two commercial hybrids as checks to correct for micro-environmental variation. At each site, two levels of nitrogen (N) fertilization were used. The experiment conducted under ideal N conditions received 100&#xa0;kg ha-1 of N (30&#xa0;kg ha-1 at sowing and 70&#xa0;kg ha-1 in a coverage application at the V8 plant stage), while the experiment with low N received only 30&#xa0;kg ha-1 at sowing. The parent lines were genotyped with an Affymetrix Axiom Maize Genotyping Array of 616&#xa0;K SNPs. Markers with a Minor Allele Frequency (MAF) below 0.05 were removed. After quality control (QC), 54,113 SNPs were available for making the predictions.</p>
<p>In this second example, we evaluated the same cases as in the wheat data example, using grid search as the tuning strategy for the hyperparameters. Likewise, in this data set the prediction performance was evaluated with five random partitions, where 80% of the data was used for training and 20% for testing, and the average over the five testing sets was reported as the prediction performance. To tune the hyperparameters, an inner 5-fold cross-validation was again used to evaluate each hyperparameter combination; a minimal sketch of this partitioning scheme is given below. <xref ref-type="table" rid="T3">Table 3</xref> shows the evaluation results for this data set (maize). The complete R code for implementing the six models with the SKM library is provided in <xref ref-type="sec" rid="s15">Supplementary Appendix SD</xref>.</p>
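<p>The following minimal sketch, in base R, illustrates the partitioning scheme just described; the seed and the objects shown are illustrative assumptions rather than part of the reported analysis.</p>
<preformat>
## Five outer random 80/20 partitions; within each training set, an inner
## 5-fold cross-validation is used to score each hyperparameter combination
set.seed(2022)          # illustrative seed
n &lt;- 722                # number of maize hybrids

partitions &lt;- lapply(1:5, function(i) {
  testing &lt;- sample(n, size = round(0.2 * n))
  training &lt;- setdiff(seq_len(n), testing)
  inner_folds &lt;- split(training,
                       sample(rep(1:5, length.out = length(training))))
  list(training = training, testing = testing, inner_folds = inner_folds)
})
</preformat>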
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Prediction performance of the Maize data set for each environment and across environments (Global) of each of the six models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th rowspan="2" align="center">Metric</th>
<th colspan="2" align="center">E1</th>
<th colspan="2" align="center">E2</th>
<th colspan="2" align="center">E3</th>
<th colspan="2" align="center">E4</th>
<th colspan="2" align="center">Global</th>
</tr>
<tr>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
<th align="center">Mean</th>
<th align="center">SE</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">M1</td>
<td>MAE</td>
<td align="char" char=".">0.2038</td>
<td align="char" char=".">0.0024</td>
<td align="char" char=".">0.4360</td>
<td align="char" char=".">0.0122</td>
<td align="char" char=".">0.2708</td>
<td align="char" char=".">0.0035</td>
<td align="char" char=".">0.5392</td>
<td align="char" char=".">0.0088</td>
<td align="char" char=".">0.3409</td>
<td align="char" char=".">0.0047</td>
</tr>
<tr>
<td align="left">M1</td>
<td>MSE</td>
<td align="char" char=".">0.0700</td>
<td align="char" char=".">0.0028</td>
<td align="char" char=".">0.2991</td>
<td align="char" char=".">0.0179</td>
<td align="char" char=".">0.1125</td>
<td align="char" char=".">0.0034</td>
<td align="char" char=".">0.4670</td>
<td align="char" char=".">0.0177</td>
<td align="char" char=".">0.2059</td>
<td align="char" char=".">0.0059</td>
</tr>
<tr>
<td align="left">M1</td>
<td>NRMSE</td>
<td align="char" char=".">0.8751</td>
<td align="char" char=".">0.0259</td>
<td align="char" char=".">0.9021</td>
<td align="char" char=".">0.0077</td>
<td align="char" char=".">0.9159</td>
<td align="char" char=".">0.0135</td>
<td align="char" char=".">0.9146</td>
<td align="char" char=".">0.0196</td>
<td align="char" char=".">0.8872</td>
<td align="char" char=".">0.0147</td>
</tr>
<tr>
<td align="left">M1</td>
<td>RMSE</td>
<td align="char" char=".">0.2644</td>
<td align="char" char=".">0.0051</td>
<td align="char" char=".">0.5459</td>
<td align="char" char=".">0.0166</td>
<td align="char" char=".">0.3352</td>
<td align="char" char=".">0.0050</td>
<td align="char" char=".">0.6829</td>
<td align="char" char=".">0.0129</td>
<td align="char" char=".">0.4535</td>
<td align="char" char=".">0.0066</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MAAPE</td>
<td align="char" char=".">0.7672</td>
<td align="char" char=".">0.0106</td>
<td align="char" char=".">0.7787</td>
<td align="char" char=".">0.0120</td>
<td align="char" char=".">0.7734</td>
<td align="char" char=".">0.0102</td>
<td align="char" char=".">0.7580</td>
<td align="char" char=".">0.0075</td>
<td align="char" char=".">0.7565</td>
<td align="char" char=".">0.0064</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MAE</td>
<td align="char" char=".">0.2040</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.4687</td>
<td align="char" char=".">0.0144</td>
<td align="char" char=".">0.2700</td>
<td align="char" char=".">0.0060</td>
<td align="char" char=".">0.5751</td>
<td align="char" char=".">0.0166</td>
<td align="char" char=".">0.3592</td>
<td align="char" char=".">0.0072</td>
</tr>
<tr>
<td align="left">M2</td>
<td>MSE</td>
<td align="char" char=".">0.0713</td>
<td align="char" char=".">0.0045</td>
<td align="char" char=".">0.3460</td>
<td align="char" char=".">0.0157</td>
<td align="char" char=".">0.1131</td>
<td align="char" char=".">0.0064</td>
<td align="char" char=".">0.5281</td>
<td align="char" char=".">0.0319</td>
<td align="char" char=".">0.2336</td>
<td align="char" char=".">0.0126</td>
</tr>
<tr>
<td align="left">M2</td>
<td>NRMSE</td>
<td align="char" char=".">0.9174</td>
<td align="char" char=".">0.0119</td>
<td align="char" char=".">0.9687</td>
<td align="char" char=".">0.0057</td>
<td align="char" char=".">0.9353</td>
<td align="char" char=".">0.0163</td>
<td align="char" char=".">0.9517</td>
<td align="char" char=".">0.0070</td>
<td align="char" char=".">0.9498</td>
<td align="char" char=".">0.0040</td>
</tr>
<tr>
<td align="left">M2</td>
<td>RMSE</td>
<td align="char" char=".">0.2664</td>
<td align="char" char=".">0.0083</td>
<td align="char" char=".">0.5876</td>
<td align="char" char=".">0.0134</td>
<td align="char" char=".">0.3358</td>
<td align="char" char=".">0.0095</td>
<td align="char" char=".">0.7253</td>
<td align="char" char=".">0.0225</td>
<td align="char" char=".">0.4826</td>
<td align="char" char=".">0.0127</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MAAPE</td>
<td align="char" char=".">0.7861</td>
<td align="char" char=".">0.0059</td>
<td align="char" char=".">0.7829</td>
<td align="char" char=".">0.0010</td>
<td align="char" char=".">0.7871</td>
<td align="char" char=".">0.0031</td>
<td align="char" char=".">0.7870</td>
<td align="char" char=".">0.0023</td>
<td align="char" char=".">0.7852</td>
<td align="char" char=".">0.0015</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MAE</td>
<td align="char" char=".">0.2187</td>
<td align="char" char=".">0.0054</td>
<td align="char" char=".">0.4814</td>
<td align="char" char=".">0.0130</td>
<td align="char" char=".">0.2855</td>
<td align="char" char=".">0.0049</td>
<td align="char" char=".">0.6109</td>
<td align="char" char=".">0.0151</td>
<td align="char" char=".">0.3817</td>
<td align="char" char=".">0.0069</td>
</tr>
<tr>
<td align="left">M3</td>
<td>MSE</td>
<td align="char" char=".">0.0847</td>
<td align="char" char=".">0.0034</td>
<td align="char" char=".">0.3701</td>
<td align="char" char=".">0.0144</td>
<td align="char" char=".">0.1287</td>
<td align="char" char=".">0.0051</td>
<td align="char" char=".">0.5861</td>
<td align="char" char=".">0.0325</td>
<td align="char" char=".">0.2603</td>
<td align="char" char=".">0.0131</td>
</tr>
<tr>
<td align="left">M3</td>
<td>NRMSE</td>
<td align="char" char=".">1.0023</td>
<td align="char" char=".">0.0027</td>
<td align="char" char=".">1.0024</td>
<td align="char" char=".">0.0031</td>
<td align="char" char=".">0.9985</td>
<td align="char" char=".">0.0010</td>
<td align="char" char=".">1.0032</td>
<td align="char" char=".">0.0013</td>
<td align="char" char=".">1.0029</td>
<td align="char" char=".">0.0014</td>
</tr>
<tr>
<td align="left">M3</td>
<td>RMSE</td>
<td align="char" char=".">0.2908</td>
<td align="char" char=".">0.0059</td>
<td align="char" char=".">0.6079</td>
<td align="char" char=".">0.0119</td>
<td align="char" char=".">0.3584</td>
<td align="char" char=".">0.0070</td>
<td align="char" char=".">0.7643</td>
<td align="char" char=".">0.0215</td>
<td align="char" char=".">0.5095</td>
<td align="char" char=".">0.0126</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MAAPE</td>
<td align="char" char=".">0.7450</td>
<td align="char" char=".">0.0146</td>
<td align="char" char=".">0.7615</td>
<td align="char" char=".">0.0114</td>
<td align="char" char=".">0.7432</td>
<td align="char" char=".">0.0150</td>
<td align="char" char=".">0.7418</td>
<td align="char" char=".">0.0077</td>
<td align="char" char=".">0.7444</td>
<td align="char" char=".">0.0063</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MAE</td>
<td align="char" char=".">0.2006</td>
<td align="char" char=".">0.0053</td>
<td align="char" char=".">0.4430</td>
<td align="char" char=".">0.0100</td>
<td align="char" char=".">0.2615</td>
<td align="char" char=".">0.0069</td>
<td align="char" char=".">0.5586</td>
<td align="char" char=".">0.0119</td>
<td align="char" char=".">0.3498</td>
<td align="char" char=".">0.0034</td>
</tr>
<tr>
<td align="left">M4</td>
<td>MSE</td>
<td align="char" char=".">0.0678</td>
<td align="char" char=".">0.0048</td>
<td align="char" char=".">0.3073</td>
<td align="char" char=".">0.0136</td>
<td align="char" char=".">0.1070</td>
<td align="char" char=".">0.0062</td>
<td align="char" char=".">0.5041</td>
<td align="char" char=".">0.0223</td>
<td align="char" char=".">0.2215</td>
<td align="char" char=".">0.0054</td>
</tr>
<tr>
<td align="left">M4</td>
<td>NRMSE</td>
<td align="char" char=".">0.8882</td>
<td align="char" char=".">0.0082</td>
<td align="char" char=".">0.9320</td>
<td align="char" char=".">0.0076</td>
<td align="char" char=".">0.9032</td>
<td align="char" char=".">0.0173</td>
<td align="char" char=".">0.9052</td>
<td align="char" char=".">0.0076</td>
<td align="char" char=".">0.8997</td>
<td align="char" char=".">0.0042</td>
</tr>
<tr>
<td align="left">M4</td>
<td>RMSE</td>
<td align="char" char=".">0.2598</td>
<td align="char" char=".">0.0091</td>
<td align="char" char=".">0.5538</td>
<td align="char" char=".">0.0122</td>
<td align="char" char=".">0.3265</td>
<td align="char" char=".">0.0096</td>
<td align="char" char=".">0.7093</td>
<td align="char" char=".">0.0157</td>
<td align="char" char=".">0.4705</td>
<td align="char" char=".">0.0058</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MAAPE</td>
<td align="char" char=".">0.7853</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.7601</td>
<td align="char" char=".">0.0125</td>
<td align="char" char=".">0.7600</td>
<td align="char" char=".">0.0064</td>
<td align="char" char=".">0.7275</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.7483</td>
<td align="char" char=".">0.0058</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MAE</td>
<td align="char" char=".">0.2199</td>
<td align="char" char=".">0.0033</td>
<td align="char" char=".">0.4507</td>
<td align="char" char=".">0.0074</td>
<td align="char" char=".">0.2747</td>
<td align="char" char=".">0.0086</td>
<td align="char" char=".">0.5259</td>
<td align="char" char=".">0.0089</td>
<td align="char" char=".">0.3426</td>
<td align="char" char=".">0.0060</td>
</tr>
<tr>
<td align="left">M5</td>
<td>MSE</td>
<td align="char" char=".">0.0796</td>
<td align="char" char=".">0.0037</td>
<td align="char" char=".">0.3269</td>
<td align="char" char=".">0.0104</td>
<td align="char" char=".">0.1166</td>
<td align="char" char=".">0.0067</td>
<td align="char" char=".">0.4500</td>
<td align="char" char=".">0.0153</td>
<td align="char" char=".">0.2099</td>
<td align="char" char=".">0.0081</td>
</tr>
<tr>
<td align="left">M5</td>
<td>NRMSE</td>
<td align="char" char=".">0.9858</td>
<td align="char" char=".">0.0087</td>
<td align="char" char=".">0.9364</td>
<td align="char" char=".">0.0111</td>
<td align="char" char=".">0.9533</td>
<td align="char" char=".">0.0223</td>
<td align="char" char=".">0.8808</td>
<td align="char" char=".">0.0063</td>
<td align="char" char=".">0.9116</td>
<td align="char" char=".">0.0065</td>
</tr>
<tr>
<td align="left">M5</td>
<td>RMSE</td>
<td align="char" char=".">0.2819</td>
<td align="char" char=".">0.0065</td>
<td align="char" char=".">0.5714</td>
<td align="char" char=".">0.0091</td>
<td align="char" char=".">0.3408</td>
<td align="char" char=".">0.0100</td>
<td align="char" char=".">0.6705</td>
<td align="char" char=".">0.0113</td>
<td align="char" char=".">0.4578</td>
<td align="char" char=".">0.0089</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MAAPE</td>
<td align="char" char=".">0.7980</td>
<td align="char" char=".">0.0126</td>
<td align="char" char=".">0.7792</td>
<td align="char" char=".">0.0101</td>
<td align="char" char=".">0.7819</td>
<td align="char" char=".">0.0189</td>
<td align="char" char=".">0.7681</td>
<td align="char" char=".">0.0113</td>
<td align="char" char=".">0.7747</td>
<td align="char" char=".">0.0110</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MAE</td>
<td align="char" char=".">0.2177</td>
<td align="char" char=".">0.0075</td>
<td align="char" char=".">0.4843</td>
<td align="char" char=".">0.0157</td>
<td align="char" char=".">0.2907</td>
<td align="char" char=".">0.0094</td>
<td align="char" char=".">0.5653</td>
<td align="char" char=".">0.0200</td>
<td align="char" char=".">0.3655</td>
<td align="char" char=".">0.0112</td>
</tr>
<tr>
<td align="left">M6</td>
<td>MSE</td>
<td align="char" char=".">0.0798</td>
<td align="char" char=".">0.0048</td>
<td align="char" char=".">0.3775</td>
<td align="char" char=".">0.0220</td>
<td align="char" char=".">0.1398</td>
<td align="char" char=".">0.0071</td>
<td align="char" char=".">0.4992</td>
<td align="char" char=".">0.0290</td>
<td align="char" char=".">0.2396</td>
<td align="char" char=".">0.0148</td>
</tr>
<tr>
<td align="left">M6</td>
<td>NRMSE</td>
<td align="char" char=".">0.9720</td>
<td align="char" char=".">0.0214</td>
<td align="char" char=".">1.0107</td>
<td align="char" char=".">0.0114</td>
<td align="char" char=".">1.0406</td>
<td align="char" char=".">0.0199</td>
<td align="char" char=".">0.9267</td>
<td align="char" char=".">0.0249</td>
<td align="char" char=".">0.9616</td>
<td align="char" char=".">0.0215</td>
</tr>
<tr>
<td align="left">M6</td>
<td>RMSE</td>
<td align="char" char=".">0.2820</td>
<td align="char" char=".">0.0084</td>
<td align="char" char=".">0.6134</td>
<td align="char" char=".">0.0180</td>
<td align="char" char=".">0.3735</td>
<td align="char" char=".">0.0096</td>
<td align="char" char=".">0.7053</td>
<td align="char" char=".">0.0212</td>
<td align="char" char=".">0.4885</td>
<td align="char" char=".">0.0155</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Generalized boosted machines (M1), generalized linear models (M2), support vector machines (M3), random forest (M4), Bayesian regression models (M5) and deep neural networks (M6). The tuning process was done under the grid search framework. Mean is the average of the five partitions for each metric and SE denotes the standard error for each metric.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref> the Global results of the maize data example are presented. <xref ref-type="fig" rid="F3">Figure 3</xref> shows the prediction performance in terms of MSE and <xref ref-type="fig" rid="F4">Figure 4</xref> the prediction performance in terms of NRMSE. According to <xref ref-type="fig" rid="F3">Figure 3</xref>, the best Global results were observed in M1 with 0.2059 of MSE followed by M5 0.2099, that is <inline-formula id="inf66">
<mml:math id="m68">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.2099</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2059</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.2099</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.9</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> worse. M1 outperformed M4 by <inline-formula id="inf67">
<mml:math id="m69">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.2215</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2059</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.2215</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>7.57</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, M2 by <inline-formula id="inf68">
<mml:math id="m70">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.2336</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2059</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.2336</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>11.85</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, M6 by <inline-formula id="inf69">
<mml:math id="m71">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.2396</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2059</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.2396</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>16.36</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and M3 (the worst) by <inline-formula id="inf70">
<mml:math id="m72">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>0.2603</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>0.2059</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>0.2603</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>20.89</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. In <xref ref-type="fig" rid="F4">Figure 4</xref> a similar pattern appears: M1 produced the best results, since it has the lowest NRMSE. The only change in the order compared to <xref ref-type="fig" rid="F3">Figure 3</xref> is that M4 outperformed M5 in terms of NRMSE. The remaining models&#x2019; results agree with <xref ref-type="fig" rid="F3">Figure 3</xref>, given that the next best results in terms of NRMSE were obtained with M2, M6 and M3, respectively.</p>
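<p>All the percentage comparisons above follow the same simple computation, shown here as a small illustration (the values are taken from the Global MSE column of <xref ref-type="table" rid="T3">Table 3</xref>):</p>
<preformat>
## Percentage by which the best model outperforms another model
relative_gain &lt;- function(mse_other, mse_best) {
  (mse_other - mse_best) / mse_other * 100
}

relative_gain(0.2099, 0.2059)   # M1 vs M5: ~1.90%
relative_gain(0.2603, 0.2059)   # M1 vs M3 (the worst): ~20.89%
</preformat>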
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Prediction performance in terms of Mean Squared Error of prediction of the six models (M1, M2, M3, M4, M5, M6) across environment (Global) in Maize data. M1 denotes the generalized boosted machine model, M2 denotes the generalized linear model, M3 denotes the support vector machine model, M4 denotes the random forest model, M5 denotes the Bayesian regression model and M6 denotes the deep neural networks model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Prediction performance in terms of Normalized Root Mean Squared Error of the six models (M1, M2, M3, M4, M5, M6) across environments (Global) in Maize data. M1 denotes the generalized boosted machine model, M2 denotes the generalized linear model, M3 denotes the support vector machine model, M4 denotes the random forest model, M5 denotes the Bayesian regression model and M6 denotes the deep neural networks model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g004.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="F5">Figure 5</xref>, we compared the performance of seven kernels for the maize data set: Linear, Polynomial, Sigmoid, Gaussian, Exponential, Arc-Cosine_1 and Arc-Cosine_2 for model M4 and M5. For model M5, the best prediction performance was observed under the Arc_cosine_1 and Polynomial kernel and the worst under the Gaussian kernel. While under model M4, the best performance in terms of MSE was observed under the Gaussian Kernel and the worst under the Linear kernel. The code used for implementing model M4 and M5 with the seven kernels are given in <xref ref-type="sec" rid="s15">Supplementary Appendix SE</xref>. It is important to point out that in the SKM library it is possible to perform kernel and sparse kernels not only under the Bayesian BGBLUP method (a sub-model of model M5, that is implemented under a RKHS method in BGLR) but under the six models (M1 to M6) that can be implemented in this library. The kernels apart from one sub-model of model M5 (BGBLUP) are implemented not using as input directly the kernel, but with the square root of the kernel for this reason is possible to be implemented with all the six models. While the sparse kernels were implemented in a similar fashion but using the method explained above, proposed of <xref ref-type="bibr" rid="B6">Cuevas et al. (2020)</xref> and for this reason, also it is possible to be implemented with the six models here evaluated (M1, &#x2026; , M6).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Prediction performance across environments (Global) in Maize data in terms of Mean Square Error (MSE) of models M4 and M5 for seven kernel methods. M4 denotes the random forest model and M5 denotes the Bayesian regression model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g005.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="F6">Figure 6</xref>, we also provide the prediction accuracies in terms of MSE for models M4 and M5 with the Arc_cosine_1 kernel for six compression levels (0.5, 0.4, 0.3, 0.2, 0.1 and 0). It is important to point out that in <xref ref-type="fig" rid="F6">Figure 6</xref>, the complement of the compression levels are given on the <italic>x</italic>-axis, which means the proportion of the columns (subsampling of lines without replacement; see <xref ref-type="bibr" rid="B6">Cuevas et al., 2020</xref>) of the complete (dense) kernel that are used as independent variables. We can observe in <xref ref-type="fig" rid="F6">Figure 6</xref> that the best prediction performance for model M5 was obtained with the compression level at 50%, that is, when the model was trained with only half of the total columns of the complete kernel. However, the worst performance in model M5 was with a compression level of 10% (LinesProportion of 0.9). On the other hand, in model M4, the best and worst prediction performance in terms of MSE was observed under compression level of 0.4 (LinesProportion of 0.6) and 0 (LinesProportion of 1) respectively. The R code for reproducing the results given in <xref ref-type="fig" rid="F6">Figure 6</xref> are provided in <xref ref-type="sec" rid="s15">Supplementary Appendix SF</xref>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Prediction performance across environments (Global) in Maize data in terms of Mean Square Error (MSE) of models M4 and M5 using the sparse Arc_cosine_1 kernel with six proportions of compression levels: 0.5, 0.4, 0.3, 0.2, 0.1 and 0, which correspond to using only the following proportions: 0.5, 0.6, 0.7, 0.8, 0.9 and 1 of the original lines (LinesProportion) for computing the kernels. M4 denotes the random forest model and M5 denotes the Bayesian regression model. The complement of level of compression level is equal to the proportion of lines used to compute the sparse kernel, that is, level of compression &#x3d; 1 minus proportion of lines used to compute the sparse kernel.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g006.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F6">Figure 6</xref> also shows, for the Arc_cosine_1 sparse kernel, that even with the largest compression level there is no relevant loss in prediction accuracy. Moreover, the larger the compression level, the less time (in hours) is required for the training process, and the reduction in execution time is almost linear (<xref ref-type="fig" rid="F7">Figure 7A</xref> for model M4 and <xref ref-type="fig" rid="F7">Figure 7B</xref> for model M5, both for the Arc_cosine_1 sparse kernel). We can also observe in these figures (<xref ref-type="fig" rid="F7">Figures 7A,B</xref>) that the time required for the training process under model M5 is considerably less than that required under model M4.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Time in hours for implementing two sparse kernels (Arc_cosine_1 and Gaussian) with the maize data set as a function of the proportion of the compression level (0.5, 0.4, 0.3, 0.2, 0.1 and 0), which corresponds to using only the following proportions: 0.5, 0.6, 0.7, 0.8, 0.9 and 1 of the original lines (LinesProportion) for computing the sparse kernels since level of compression &#x3d; 1 minus proportion of lines used to compute the sparse kernel. <bold>(A)</bold> corresponds to M4 and Arc_cosine_1 sparse kernel. <bold>(B)</bold> corresponds to M5 Arc_cosine_1 sparse kernel. <bold>(C)</bold> corresponds to M4 and Gaussian sparse kernel. <bold>(D)</bold> corresponds to M5 Gaussian sparse kernel. M4 denotes the random forest model and M5 denotes the Bayesian regression model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g007.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> shows the prediction performance in terms of MSE for models M4 and M5, now with the Gaussian kernel and the same six compression levels (0.5, 0.4, 0.3, 0.2, 0.1 and 0). For model M5, we did not observe any significant loss in prediction performance across the six compression levels evaluated; the best prediction performance was obtained with the largest compression level (0.5; LinesProportion of 0.5), and among the remaining compression levels we did not observe significant differences. The R code for reproducing the results given in <xref ref-type="fig" rid="F8">Figure 8</xref> is provided in <xref ref-type="sec" rid="s15">Supplementary Appendix SF</xref>.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Prediction performance across environments (Global) in Maize data in terms of Mean Square Error (MSE) of models M4 and M5 using the sparse Gaussian kernel with six proportions of compression levels: 0.5, 0.4, 0.3, 0.2, 0.1 and 0, which correspond to using only the following proportions: 0.5, 0.6, 0.7, 0.8, 0.9 and 1 of the original lines (LinesProportion) for computing the sparse kernels, since level of compression &#x3d; 1 minus proportion of lines used to compute the sparse kernel. M4 denotes the random forest model and M5 denotes the Bayesian regression model.</p>
</caption>
<graphic xlink:href="fgene-13-887643-g008.tif"/>
</fig>
<p>It is observed in <xref ref-type="fig" rid="F8">Figure 8</xref> that even with the largest compression level we did not experience a significant loss in prediction accuracy. When the compression level was larger, less time was required for the training process (<xref ref-type="fig" rid="F7">Figures 7C,D</xref>). While the trend is not totally linear under model M4 with the Gaussian sparse kernel, it is still clear that a significant reduction in time is achieved when the compression level increases. On the other hand, under model M5 with the Gaussian sparse kernel, a linear reduction in training time is observed as the compression level increases. This is particularly interesting since it translates into significant savings of computational resources without a significant loss of prediction accuracy. Furthermore, <xref ref-type="fig" rid="F7">Figure 7D</xref> also shows that model M5 requires considerably less time for the training process than model M4 (<xref ref-type="fig" rid="F7">Figure 7C</xref>).</p>
<p>The information provided in <xref ref-type="fig" rid="F7">Figure 7</xref> illustrates that the use of sparse kernels can significantly reduce the time needed to implement the prediction models relative to dense kernels (without any compression). For example, <xref ref-type="fig" rid="F7">Figure 7</xref> shows that the larger the compression level, the larger the savings in computational resources. However, as observed in <xref ref-type="fig" rid="F6">Figures 6</xref>, <xref ref-type="fig" rid="F8">8</xref>, caution must be exercised when choosing the compression level, because a very large compression level could negatively affect prediction accuracy. Nevertheless, <xref ref-type="fig" rid="F6">Figures 6</xref>, <xref ref-type="fig" rid="F8">8</xref> show that even with a compression level of 50%, genomic prediction accuracy is not dramatically affected. In general, models M4 and M5 with the sparse Gaussian kernel maintained or enhanced the genome-based prediction accuracy relative to the dense kernel across all compression levels.</p>
</sec>
</sec>
<sec id="s7">
<title>Default Settings for the Algorithms</title>
<p>The default setting for the algorithms that require a tuning process (M1, M2, M3, M4 and M6) is the &#x201c;Grid_search&#x201d; tuning strategy, but this only works when a grid is actually specified, that is, when at least one hyperparameter is given at least two values to be evaluated. For the tuning process, an inner (nested) K-fold cross-validation with K &#x3d; 5 is implemented by default. When Bayesian optimization is selected for the tuning process, 10 iterations are explored by default. <xref ref-type="table" rid="T4">Table 4</xref> gives the default hyperparameters for each of the six models; a brief usage sketch is given after the table.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Default hyper-parameters for each of the models that can be implemented in the SKM library.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Name</th>
<th align="center">Default Hyper-parameter values</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="left">M1</td>
<td rowspan="3" align="left">Generalized boosted machines</td>
<td align="left">trees_number &#x3d; 500, max_depth &#x3d; 1</td>
</tr>
<tr>
<td align="left">node_size &#x3d; 10, shrinkage &#x3d; 0.1</td>
</tr>
<tr>
<td align="left">sampled_records_proportion &#x3d; 0.5</td>
</tr>
<tr>
<td align="left">M2</td>
<td align="left">Generalized linear models</td>
<td align="left">alpha &#x3d; 1; with alpha between 0 and 1 (for Elastic net Regression) and alpha &#x3d; 0 for Ridge regression and alpha equal to 1 for Lasso Regression</td>
</tr>
<tr>
<td align="left">M3</td>
<td align="left">Support vector machines</td>
<td align="left">kernel &#x3d; &#x201c;linear&#x201d;, degree &#x3d; 3, gamma &#x3d; 1/NCOL(x), coef0 &#x3d; 0 and cost &#x3d; 1</td>
</tr>
<tr>
<td align="left">M4</td>
<td align="left">Random forest</td>
<td align="left">trees_number &#x3d; 500, node_size &#x3d; 5, node_depth &#x3d; NULL and sampled_x_vars_number &#x3d; NULL</td>
</tr>
<tr>
<td align="left">M5</td>
<td align="left">Bayesian regression models</td>
<td align="left">Not applied since are not required hyperparameters since run with the default values of the BGLR library</td>
</tr>
<tr>
<td rowspan="3" align="left">M6</td>
<td rowspan="3" align="left">Deep neural networks</td>
<td align="left">learning_rate &#x3d; 0.001, epochs_number &#x3d; 500, batch_size &#x3d; 32, layers &#x3d; list (list (neurons_number &#x3d; 50, neurons_proportion &#x3d; NULL, activation &#x3d;</td>
</tr>
<tr>
<td align="left">&#x201c;relu&#x201d;, dropout &#x3d; 0, ridge_penalty &#x3d; 0, lasso_penalty &#x3d; 0)), output_penalties &#x3d; list (ridge_penalty &#x3d; 0, lasso_penalty &#x3d; 0), optimizer &#x3d; &#x201c;adam&#x201d;, shuffle &#x3d; TRUE, early_stop &#x3d; FALSE</td>
</tr>
<tr>
<td align="left">early_stop_patience &#x3d; 50</td>
</tr>
</tbody>
</table>
</table-wrap>
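<p>As a hedged usage sketch (the hyperparameter names follow <xref ref-type="table" rid="T4">Table 4</xref>, while the data objects are placeholders and the exact function signature should be checked against the SKM documentation), supplying more than one candidate value for a hyperparameter triggers the default grid search with its inner 5-fold cross-validation:</p>
<preformat>
library(SKM)

## Placeholder data, not from the examples above
X &lt;- matrix(rnorm(100 * 1000), 100, 1000)
y &lt;- rnorm(100)

## Random forest (M4): trees_number receives a three-point grid and is
## tuned; node_size keeps a single value and is therefore not tuned
model &lt;- random_forest(
  x = X,
  y = y,
  trees_number = c(300, 500, 800),
  node_size = 5
)
</preformat>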
</sec>
<sec sec-type="discussion" id="s8">
<title>Discussion</title>
<p>Data is playing an unprecedented role in the twenty-first century. For this reason, many companies consider data science a fundamental component for extracting useful knowledge, making better decisions, reducing losses, analyzing market trends and increasing profits. Likewise, it is playing an essential role in increasing the rate of scientific and technological discoveries. For these reasons, the demand for data scientists continues to increase and is expected to grow by 27.9% by 2026, according to the US Bureau of Labor Statistics (<xref ref-type="bibr" rid="B27">Rieley, 2018</xref>). However, to satisfy this growing demand, people with different backgrounds need to be trained in this area rather rapidly. In this vein, more open-source and user-friendly software such as the SKM library is needed to extract useful knowledge more efficiently from raw data. Even though there are many tools for implementing supervised machine learning methods in the R statistical software, they are still insufficient to cover the broad spectrum of needs, as there are many complex tasks not covered by existing tools.</p>
<p>For example, our library (SKM), in addition to grid search for hyperparameter tuning, also includes the Bayesian optimization method, which is a sequential design strategy for the global optimization of black-box functions that does not presume any functional form. It is generally employed to optimize functions that are expensive to evaluate. Bayesian optimization, contrary to a grid search that performs an exhaustive evaluation over each point of the grid of values given for each hyperparameter, needs very few evaluations as starting points and, based on the knowledge at hand, can indicate which point should be evaluated next. Bayesian optimization makes these decisions with acquisition functions, which are heuristics for how desirable it is to evaluate a point under the present surrogate model. At every step, the Bayesian optimization method determines the best point to evaluate by optimizing the acquisition function (<xref ref-type="bibr" rid="B16">Mockus, 2012</xref>). The model is then updated, and this process is repeated to determine the next point to evaluate.</p>
<p>For machine learning algorithms to successfully perform a grid search, a very large number of values for each hyperparameter is required, and as such, this method is frequently rendered impractical because the required computational resources are substantial. For this reason, our library (SKM) is novel in that hyperparameter tuning can also be carried out with the Bayesian optimization algorithm, which is well suited when the function evaluations are expensive.</p>
<p>We do not expect the proposed SKM library to replace libraries like mlr3 and scikit-learn, since these libraries will continue to be suitable options for those who seek a complete solution for a particular machine learning implementation. Nonetheless, our library (SKM) will be a great alternative for its simplicity, as it can be used with six conventional machine learning algorithms with some kernel methods, and thus, help to better capture non-linear patterns in the data.</p>
<p>Additionally, to the best of our knowledge, this is the first library that permits kernels to be implemented with six conventional machine learning methods in a very simple way, which can help increase the prediction performance when the input data contain non-linear patterns. Furthermore, the SKM package permits the implementation of approximate kernels (here called sparse kernels), which can help reduce the computational resources needed for data sets of large dimensions, without a significant reduction in accuracy. In comparison to typical kernels, which reduce the input size to the number of observations, sparse kernels can reduce the input size to even less than the number of observations and, in this way, save more computational resources. It must be noted that since the kernels are built in an independent first step, a computed kernel can be used with any machine learning method.</p>
<p>The proposed SKM library allows multivariate responses for continuous outcomes to be trained under the Bayesian framework and generalized linear models, and multivariate continuous, binary and categorical outcomes to be trained under the random forest method. Only deep neural networks allow multivariate responses with continuous, binary, categorical and count outcomes to be trained. Contrarily, only univariate models can be trained under generalized boosted machines and support vector machines. As we previously stated, the six models can be implemented with seven kernels: Linear, Polynomial, Sigmoid, Gaussian, Exponential, Arc-Cosine 1 and Arc-Cosine L (with L &#x3d; 2, 3, &#x2026;). This is useful because when the dimensionality of the input is larger than the number of training samples, greater computational resources are needed; using any of these kernels reduces the input size to the number of training samples, which in turn reduces the computational resources needed, while permitting non-linear patterns to be captured more efficiently.</p>
<p>As the illustrative examples show, the library can implement supervised machine learning methods for binary, categorical, count and continuous response variables, with the advantage that the user does not need to specify the type of response to be modeled: when the response variable is provided as a factor, the library infers whether to fit a binary or a categorical model from the number of categories; when the response variable is numeric, the library fits a count or continuous model (a minimal sketch of this convention follows).</p>
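<p>A minimal sketch of this convention (the response vectors are hypothetical):</p>
<preformat>
## Factor with two levels     -&gt; binary classification model
y_binary &lt;- factor(c("Tolerant", "Susceptible", "Tolerant"))

## Factor with more levels    -&gt; categorical classification model
y_categorical &lt;- factor(c("Low", "Medium", "High", "Low"))

## Numeric vector             -&gt; continuous (or count) regression model
y_continuous &lt;- c(5.1, 6.3, 5.8, 6.0)
</preformat>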
</sec>
<sec id="s9">
<title>Conclusion and Future Work</title>
<p>This new package will benefit both machine learning practitioners and researchers who want to implement predictive models in a simple way with state-of-the-art methods for tuning hyperparameters, such as Bayesian optimization. We also expect people from different disciplines who are not programming experts to be able to take advantage of the simplicity of SKM to enter the machine learning world.</p>
<p>The kernelize function in SKM is of special interest, since this is the first package that allows kernels to be used with different machine learning algorithms as a new approach to working with complex non-linear and high-dimensional data.</p>
<p>This new package is not intended to provide a full data science solution; rather, new machine learning algorithms can be included in future versions, along with more metric functions, model benchmarking, data input and other data science-related tools.</p>
<p>With the plant breeding examples provided, we illustrated how this library can implement six machine learning algorithms and seven types of kernel methods in the context of genomic prediction. Moreover, we illustrated that the implementation of sparse kernels can save significant computation resources without a significant loss in prediction accuracy. Finally, in the appendices, we provided all the codes so that users from different backgrounds and areas of interest can easily implement all the models and tools provided in the SKM library.</p>
</sec>
</body>
<back>
<sec id="s10">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s15">Supplementary Material</xref>, further inquiries can be directed to the first author and/or the corresponding authors and can be found in: <ext-link ext-link-type="uri" xlink:href="https://github.com/osval78/SKM_Library_Examples">https://github.com/osval78/SKM_Library_Examples</ext-link>.</p>
</sec>
<sec id="s11">
<title>Author Contributions</title>
<p>OAML, AML, and JC had the original idea, and BM and AP assisted in writing the R codes. All the authors participated in writing the first version and in reviewing several improved versions.</p>
</sec>
<sec id="s12">
<title>Funding</title>
<p>We are thankful for the financial support provided by the Bill &#x26; Melinda Gates Foundation [INV-003439, BMGF/FCDO, Accelerating Genetic Gains in Maize and Wheat for Improved Livelihoods (AG2MW)], the USAID projects USAID Amend. No. 9 MTO 069033, USAID-CIMMYT Wheat/AGGMW, AGG-Maize Supplementary Project, AGG (Stress Tolerant Maize for Africa), and the CIMMYT CRP (maize and wheat).</p>
</sec>
<sec sec-type="COI-statement" id="s13">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s14">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ack>
<p>We acknowledge the financial support provided by the Foundation for Research Levy on Agricultural Products (FFL) and the Agricultural Agreement Research Fund (JA) in Norway through NFR grant 267806.</p>
</ack>
<sec id="s15">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2022.887643/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2022.887643/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/docx" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Abadi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Barham</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brevdo</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Citro</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems</article-title>. <comment>Available: <ext-link ext-link-type="uri" xlink:href="https://www.tensorflow.org/">https://www.tensorflow.org/</ext-link>.</comment> </citation>
</ref>
<ref id="B2">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Allaire</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chollet</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Keras: R Interface to Keras</article-title>. <comment>Available: <ext-link ext-link-type="uri" xlink:href="https://keras.rstudio.com">https://keras.rstudio.com</ext-link> (Accessed July 11, 2018).</comment> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>de los Campos</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>P&#xe9;rez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Gianola</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Burgue&#xf1;o</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Araus</surname>
<given-names>J. L.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Prediction of Genetic Values of Quantitative Traits in Plant Breeding Using Pedigree and Molecular Markers</article-title>. <source>Genetics</source> <volume>186</volume> (<issue>2</issue>), <fpage>713</fpage>&#x2013;<lpage>724</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.110.118521</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Burgue&#xf1;o</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>de los Campos</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Bayesian Genomic Prediction with Genotype &#xd7; Environment Interaction Kernel Models</article-title>. <source>G3 Genes&#x7c;Genomes&#x7c;Genetics</source> <volume>7</volume> (<issue>1</issue>), <fpage>41</fpage>&#x2013;<lpage>53</lpage>. <pub-id pub-id-type="doi">10.1534/g3.116.035584</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Soberanis</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>P&#xe9;rez&#x2010;Elizalde</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>P&#xe9;rez&#x2010;Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>de los Campos</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Genomic Prediction of Genotype &#xd7; Environment Interaction Kernel Regression Models</article-title>. <source>Plant Genome</source> <volume>9</volume> (<issue>3</issue>), <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.3835/plantgenome2016.03.0024</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Martini</surname>
<given-names>J. W. R.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lillemo</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Approximate Genome-Based Kernel Models for Large Data Sets Including Main Effects and Interactions</article-title>. <source>Front. Genet.</source> <volume>11</volume>, <fpage>567757</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2020.567757</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Juliana</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Guzm&#xe1;n</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Gonz&#xe1;lez-Bucio</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Deep Kernel for Genomic and Near Infrared Predictions in Multi-Environment Breeding Trials</article-title>. <source>G3 Genes&#x7c;Genomes&#x7c;Genetics</source> <volume>9</volume> (<issue>9</issue>), <fpage>2913</fpage>&#x2013;<lpage>2924</lpage>. <pub-id pub-id-type="doi">10.1534/g3.119.400493</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friedman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hastie</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tibshirani</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Regularization Paths for Generalized Linear Models via Coordinate Descent</article-title>. <source>J. Stat. Softw.</source> <volume>33</volume> (<issue>1</issue>), <fpage>1</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.18637/jss.v033.i01</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Greenwell</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Boehmke</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cunningham</surname>
<given-names>J.</given-names>
</name>
<collab>GBM Developers</collab>
</person-group> (<year>2020</year>). <article-title>Gbm: Generalized Boosted Regression Models</article-title>. <comment>Available: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=gbm">https://CRAN.R-project.org/package&#x3d;gbm</ext-link>.</comment> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ishwaran</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kogalur</surname>
<given-names>U. B.</given-names>
</name>
<name>
<surname>Blackstone</surname>
<given-names>E. H.</given-names>
</name>
<name>
<surname>Lauer</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Random Survival Forests</article-title>. <source>Ann. Appl. Stat.</source> <volume>2</volume> (<issue>3</issue>), <fpage>841</fpage>&#x2013;<lpage>860</lpage>. <pub-id pub-id-type="doi">10.1214/08-aoas169</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kuhn</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Caret: Classification and Regression Training</article-title>. <comment>R package version 6.0-71. Available: <ext-link ext-link-type="uri" xlink:href="https://github.com/topepo/caret/">https://github.com/topepo/caret/</ext-link>.</comment> </citation>
</ref>
<ref id="B12">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kuhn</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wickham</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tidymodels: A Collection of Packages for Modeling and Machine Learning Using Tidyverse Principles</article-title>. <comment>Available: <ext-link ext-link-type="uri" xlink:href="https://www.tidymodels.org">https://www.tidymodels.org</ext-link>.</comment> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Binder</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Richter</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schratz</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pfisterer</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Coors</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>mlr3: A Modern Object-Oriented Machine Learning Framework in R</article-title>. <source>Joss</source> <volume>4</volume> (<issue>44</issue>), <fpage>1903</fpage>. <pub-id pub-id-type="doi">10.21105/joss.01903</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lang</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Checkmate: Fast Argument Checks for Defensive R Programming</article-title>. <source>R J.</source> <volume>9</volume> (<issue>1</issue>), <fpage>437</fpage>&#x2013;<lpage>445</lpage>. <pub-id pub-id-type="doi">10.32614/RJ-2017-028</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Meyer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Dimitriadou</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hornik</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Weingessel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Leisch</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>e1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien</article-title>. <comment>R package version 1.7-2. Available: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=e1071">https://CRAN.R-project.org/package&#x3d;e1071</ext-link>.</comment> </citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mockus</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2012</year>). <source>Bayesian Approach to Global Optimization: Theory and Applications</source>. <publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer Science &#x26; Business Media</publisher-name>. </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montesinos&#x2010;L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Montesinos&#x2010;L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hernandez&#x2010;Suarez</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Barr&#xf3;n&#x2010;L&#xf3;pez</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Deep&#x2010;learning Power and Perspectives for Genomic Selection</article-title>. <source>Plant Genome</source> <volume>14</volume> (<issue>3</issue>), <fpage>e20122</fpage>. <pub-id pub-id-type="doi">10.1002/tpg2.20122</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Flores-Cortes</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>de la Rosa</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>A Guide for Kernel Generalized Regression Methods for Genomic-Enabled Prediction</article-title>. <source>Heredity</source> <volume>126</volume> (<issue>4</issue>), <fpage>577</fpage>&#x2013;<lpage>596</lpage>. <pub-id pub-id-type="doi">10.1038/s41437-021-00412-1</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022b</year>). &#x201c;<article-title>Overfitting, Model Tuning and Evaluation of Prediction Performance</article-title>,&#x201d; in <source>Multivariate Statistical Machine Learning Methods for Genomic Prediction</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>O.A.</given-names>
</name>
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <volume>2022</volume>, <fpage>109</fpage>&#x2013;<lpage>139</lpage>. </citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022a</year>). &#x201c;<article-title>Reproducing Kernel Hilbert Spaces Regression and Classification Methods</article-title>,&#x201d; in <source>Multivariate Statistical Machine Learning Methods for Genomic Prediction</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>O.A.</given-names>
</name>
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<publisher-loc>Cham, Switzerland</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <volume>2022</volume>, <fpage>251</fpage>&#x2013;<lpage>336</lpage>. </citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Osborne</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Roberts</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Gaussian Processes for Global Optimization</source>. <publisher-loc>Oxford, UK</publisher-loc>: <publisher-name>Learning and Intelligent Optimization</publisher-name>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ott</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Palm</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Vogt</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Oberprieler</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>GinJinn: An Object&#x2010;detection Pipeline for Automated Feature Extraction from Herbarium Specimens</article-title>. <source>Appl. Plant Sci.</source> <volume>8</volume> (<issue>6</issue>), <fpage>e11351</fpage>. <pub-id pub-id-type="doi">10.1002/aps3.11351</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<collab>Pandas development team</collab> (<year>2020</year>). <source>Pandas-dev/pandas</source>. <publisher-name>Pandas. Zenodo</publisher-name>. <pub-id pub-id-type="doi">10.5281/zenodo.3509134</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>P&#xe9;rez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>de los Campos</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Genome-wide Regression and Prediction with the Bglr Statistical Package</article-title>. <source>Genetics</source> <volume>198</volume> (<issue>2</issue>), <fpage>483</fpage>&#x2013;<lpage>495</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.114.164442</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<collab>R Core Team</collab> (<year>2021</year>). <source>R: A Language and Environment for Statistical Computing</source>. <publisher-loc>Vienna, Austria</publisher-loc>: <publisher-name>R Foundation for Statistical Computing</publisher-name>. <comment>Available: <ext-link ext-link-type="uri" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link>.</comment> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rieley</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Big Data Adds up to Opportunities in Math Careers</article-title>. <source>Beyond the Numbers: Employment &#x26; Unemployment</source> <volume>7</volume> (<issue>8</issue>). <ext-link ext-link-type="uri" xlink:href="https://www.bls.gov/opub/btn/volume-7/big-data-adds-up.htm">https://www.bls.gov/opub/btn/volume-7/big-data-adds-up.htm</ext-link>. </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shahin</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Nassif</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Hamsa</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Novel Cascaded Gaussian Mixture Model-Deep Neural Network Classifier for Speaker Identification in Emotional Talking Environments</article-title>. <source>Neural Comput. Appl.</source> <volume>32</volume> (<issue>7</issue>), <fpage>2575</fpage>&#x2013;<lpage>2587</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-018-3760-2</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Souza</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>de Oliveira</surname>
<given-names>C. E. G.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jarqu&#xed;n</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Fritsche-Neto</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Genomic-Enabled Prediction in Maize Using Kernel Models with Genotype &#xd7; Environment Interaction</article-title>. <source>G3 (Bethesda)</source> <volume>7</volume> (<issue>6</issue>), <fpage>1995</fpage>&#x2013;<lpage>2014</lpage>. <pub-id pub-id-type="doi">10.1534/g3.117.042341</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Aung</surname>
<given-names>M. S. H.</given-names>
</name>
<name>
<surname>Abdullah</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Brian</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Campbell</surname>
<given-names>A. T.</given-names>
</name>
<name>
<surname>Choudhury</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>CrossCheck: Toward Passive Sensing and Detection of Mental Health Changes in People with Schizophrenia</article-title>,&#x201d; in <conf-name>UbiComp 2016 - Proceedings of the 2016 ACM International Joint Conference on Pervasive and Ubiquitous Computing</conf-name>, <conf-loc>Heidelberg, Germany</conf-loc>, <conf-date>September 12-16, 2016</conf-date> (<publisher-name>Association for Computing Machinery, Inc</publisher-name>), <fpage>886</fpage>&#x2013;<lpage>897</lpage>. <pub-id pub-id-type="doi">10.1145/2971648.2971740</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wickham</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Averick</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bryan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>McGowan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Fran&#xe7;ois</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Welcome to the Tidyverse</article-title>. <source>Joss</source> <volume>4</volume> (<issue>43</issue>), <fpage>1686</fpage>. <pub-id pub-id-type="doi">10.21105/joss.01686</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Wickham</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Fran&#xe7;ois</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Henry</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Dplyr: A Grammar of Data Manipulation</article-title>. <comment>R Package Version 0.4.3 <ext-link ext-link-type="uri" xlink:href="http://CRAN.R-project.org/package=dplyr">http://CRAN.R-project.org/package&#x3d;dplyr</ext-link>.</comment> </citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Williams</surname>
<given-names>C. K. I.</given-names>
</name>
<name>
<surname>Seeger</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Using the Nystr&#xf6;m Method to Speed up Kernel Machines</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Leen</surname>
<given-names>T.K.</given-names>
</name>
<name>
<surname>Dietterich</surname>
<given-names>T.G.</given-names>
</name>
<name>
<surname>Tresp</surname>
<given-names>V.</given-names>
</name>
</person-group> (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>), <volume>13</volume>, <fpage>682</fpage>&#x2013;<lpage>688</lpage>. </citation>
</ref>
<ref id="B34">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>RBayesianOptimization: Bayesian Optimization of Hyperparameters</article-title>. <comment>R package version 1.1.0. [Online]. Available: <ext-link ext-link-type="uri" xlink:href="https://CRAN.R-project.org/package=rBayesianOptimization">https://CRAN.R-project.org/package&#x3d;rBayesianOptimization</ext-link>.</comment> </citation>
</ref>
</ref-list>
</back>
</article>