<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">559617</article-id>
<article-id pub-id-type="doi">10.3389/frai.2020.559617</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>An Interpretable Predictive Model of Vaccine Utilization for Tanzania</article-title>
<alt-title alt-title-type="left-running-head">Hariharan et al.</alt-title>
<alt-title alt-title-type="right-running-head">Vaccine Utilization Forecasting</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hariharan</surname>
<given-names>Ramkumar</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">
<sup>&#x2a;</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/972167/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sundberg</surname>
<given-names>Johnna</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gallino</surname>
<given-names>Giacomo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1025114/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Schmidt</surname>
<given-names>Ashley</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Arenth</surname>
<given-names>Drew</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1078272/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Sra</surname>
<given-names>Suvrit</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">
<sup>&#x2a;</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Fels</surname>
<given-names>Benjamin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">
<sup>&#x2a;</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/931688/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Macro-Eyes, Inc, <addr-line>Seattle</addr-line>, <addr-line>WA</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>College of Engineering, <institution>Northeastern University</institution>, <addr-line>Seattle</addr-line>, <addr-line>WA</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Laboratory for Information and Decision Systems, Department of Electrical Engineering and Computer Science, MIT, <addr-line>MA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/740103/overview">Wojciech Samek</ext-link>, Heinrich Hertz Institute (FHG), Germany</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/172729/overview">Riccardo Zese</ext-link>, University of Ferrara, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1059274/overview">Abdulrazak Yahya Saleh</ext-link>, Universiti Malaysia Sarawak, Malaysia</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Ramkumar Hariharan, <email>ram@macro-eyes.com</email>; Suvrit Sra, <email>suvrit@macro-eyes.com</email>; Benjamin Fels, <email>benjamin@macro-eyes.com</email>
</corresp>
<fn>
<p>This article was submitted to Machine Learning and Artificial Intelligence, a section of the journal Frontiers in Artificial Intelligence</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>10</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>3</volume>
<elocation-id>559617</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>05</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>09</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2020 Hariharan, Sundberg, Gallino, Schmidt, Arenth, Sra and Fels.</copyright-statement>
<copyright-year>2020</copyright-year>
<copyright-holder>Hariharan, Sundberg, Gallino, Schmidt, Arenth, Sra and Fels</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Providing accurate utilization forecasts is key to maintaining optimal vaccine stocks in any health facility. Current approaches to vaccine utilization forecasting are based on often outdated population census data, and rely on weak, low-dimensional demand forecasting models. Further, these models provide very little insight into factors that influence vaccine utilization. Here, we built a state-of-the-art machine learning model using novel, temporally and regionally relevant vaccine utilization data. This highly multidimensional machine learning approach accurately predicted bi-weekly vaccine utilization at the individual health facility level. Specifically, we achieved a forecasting fraction error of less than two for about 45% of regional health facilities in both the Tanzania regions analyzed. Our &#x201c;random forest regressor&#x201d; had an average forecasting fraction error that was almost 18 times lower than that of the existing system. Importantly, using our model, we gleaned several key insights into factors underlying utilization forecasts. This work serves as an important starting point to reimagining predictive health systems in the developing world by leveraging the power of Artificial Intelligence and big data.</p>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>forecasting</kwd>
<kwd>artificial intelligence</kwd>
<kwd>random forests</kwd>
<kwd>vaccine</kwd>
</kwd-group>
<contract-sponsor id="cn001">Bill and Melinda Gates Foundation<named-content content-type="fundref-id">10.13039/100000865</named-content>
</contract-sponsor>
<counts>
<page-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Vaccines have been touted as the &#x201c;single most life-saving healthcare innovation ever&#x201d; (Orenstein and Ahmed, 2017). It has also been emphasized that vaccination and not vaccines save lives (<xref ref-type="bibr" rid="B1">Breiman and Friedman, 1984</xref>). Additionally, a recent study on 94 low- and middle-income countries estimated that a $34 billion investment in immunization programs resulted in savings of $1.53 trillion in broad illness-related economic benefits (<xref ref-type="bibr" rid="B11">Ozawa et al., 2016</xref>). Maximizing immunization coverage for any population is an important public health goal for all countries and 194 Member States of the World Health Assembly in May 2012 agree, having developed a framework to prevent millions of deaths by 2020 through more equitable access to existing vaccines for people in all communities (WHO, Global Vaccine Action Plan 2012&#x2013;2020).</p>
<p>One of the challenges that countries need to overcome to move closer to this goal is accurate forecasting of vaccine utilization (<xref ref-type="bibr" rid="B9">Mueller et al., 2016</xref>). Under-estimation of vaccine demand can lead to reduced coverage and vaccine stock-outs while over-estimation leads to vaccine wastage (<xref ref-type="bibr" rid="B9">Mueller et al., 2016</xref>). The majority of existing vaccine utilization forecasting systems fall into one of two broad categories: 1) Routine data collection such as data on immunization and/or stock level changes entered by health workers (<xref ref-type="bibr" rid="B7">Logistimo, 2011</xref>) and past trends detected from immunization and/or stock level change data extrapolated to forecast future utilization; and 2) Population level data using population level survey data on pregnant women and child births (<xref ref-type="bibr" rid="B20">John Snow, Inc. and USAID, 2010</xref>) extrapolated based on an age-based vaccination scheme and then used to calculate utilization. A recent study also used a discrete event simulation model to predict the effect of introducing a demand forecasting system into a low-income country&#x2019;s supply chain (<xref ref-type="bibr" rid="B9">Mueller et al., 2016</xref>).</p>
<p>However, as pointed out by multiple studies, existing vaccine utilization forecasting systems are far from perfect and have considerable room for improvement (<xref ref-type="bibr" rid="B12">Patel et al., 2015</xref>; <xref ref-type="bibr" rid="B8">Lydon et al., 2017</xref>; <xref ref-type="bibr" rid="B13">Path and World Health Organization report, 2011</xref>). These significant inaccuracies in forecasting vaccine utilization may stem from data inaccuracies, for example, the error inherent in attempting to extrapolate population census data, which is often vastly outdated. Another source of inaccuracy in forecasting vaccine utilization is the use of univariate forecasting models that do not take into account the inherently multivariate nature of the problem. For example, vaccine utilization at any given health facility is driven by factors apart from the catchment population. Factors may include the characteristics of the facility such as ease of access as reflected in its geo-coordinates, type of facility such as private or public, altitude, etc. These two challenges can be addressed by 1) use of actual vaccine utilization data for each health clinic as this data is very close to ground reality and is recent; and 2) building and applying a multivariate machine learning approach that not only uses vaccine utilization data, but also leverages other kinds of data about the health facility and population to predict vaccine utilization.</p>
<p>Here, we used recent vaccine utilization data, together with publicly available highly multivariate health facility data to forecast individual health facility level vaccine utilization in two Tanzania regions. Importantly, to accurately forecast vaccine utilization, we trained and applied a powerful machine learning model: a Random Forest Regressor (RFR). Our approach not only gives accurate vaccine utilization forecasts, but also provides insights into the data itself. Our findings have clinical and global health program relevance because accurate forecasting of utilization down to facility level will serve to reduce vaccine wastage and stock-outs, in turn contributing to optimal vaccine deployment and the most efficient use of resources.</p>
<p>To the best of our knowledge, this paper is the first to leverage regionally and temporally relevant utilization data together with a host of other features to forecast vaccine utilization.</p>
<p>We worked with the Ministry of Health of Tanzania and an NGO partner, PATH, with experience in the region and specifically vaccines to pilot our approach in three regions of Tanzania.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and Methods</title>
<p>We used Python v3.6 (<xref ref-type="bibr" rid="B15">Python Software Foundation, 2018</xref>), and Jupyter notebook v5.4.0 (<xref ref-type="bibr" rid="B4">Perez and Granger, 2007</xref>) for all calculations and plots. Specifically, for data pre-processing, we used the fastai library (v1.0, <xref ref-type="bibr" rid="B6">Howard and Gugger, 2020</xref>), whereas for machine learning and model performance evaluation, we relied on scikit-learn v0.21 (<xref ref-type="bibr" rid="B14">Pedregosa et al., 2011</xref>). We used seaborn and matplotlib for generating plots of data (<xref ref-type="bibr" rid="B17">Seaborn, Matplotlib</xref>).</p>
<sec id="s2-1">
<title>Data Gathering and Feature Augmentation</title>
<p>We used a diverse set of features or variables to forecast vaccine utilization. We obtained daily vaccine utilization data from 710 health facilities across three different regions in Tanzania&#x2014;Arusha, Tanga and Kilimanjaro. The data had 13 features for every observation. We split the datetime feature into 12 features using the fastai add_datepart function (<xref ref-type="bibr" rid="B6">Howard and Gugger, 2020</xref>). While these features can partially contribute to predicting vaccine consumption, we believed that analyzing additional features describing individual facilities and their catchment populations would improve predictive performance. Therefore, we examined the Tanzania Health Facility Registry (THFR, see website reference) and the Tanzania National Bureau of Statistics (TNBS, see website reference) to extract several additional dimensions of data to augment our feature set. We also used a web-based tool (<xref ref-type="bibr" rid="B5">GPS visualizer</xref>, see website reference) to add elevation to each health facility. The data we used for building predictive models included several new features such as geo-coordinates, distance to nearest facility, type of facility and regional population (see <xref ref-type="table" rid="T1">Table 1</xref> for a comprehensive list of features). Additionally, since recent vaccine utilization can serve as a useful feature, we used a rolling 3-month average as a feature. This resulted in a total of 32 features which went into our machine learning model. These features encompassed key intuitive vaccine utilization determinants &#x2014; 1) details of the nature of the vaccine, 2) details of each health facility and 3) features related to the catchment population around each facility.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>List of features used for predicting vaccine utilization in three regions of Tanzania.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Sl No</th>
<th align="center">Name of feature</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td align="left">Facility ID<sup>1</sup>
</td>
</tr>
<tr>
<td>3</td>
<td align="left">Region<sup>1</sup>
</td>
</tr>
<tr>
<td>4</td>
<td align="left">District<sup>1</sup>
</td>
</tr>
<tr>
<td>5</td>
<td align="left">Ward<sup>1</sup>
</td>
</tr>
<tr>
<td>6</td>
<td align="left">Village/street<sup>1</sup>
</td>
</tr>
<tr>
<td>7</td>
<td align="left">Transaction description<sup>1</sup>
</td>
</tr>
<tr>
<td>8</td>
<td align="left">Vaccine type<sup>1</sup>
</td>
</tr>
<tr>
<td>9</td>
<td align="left">Change in stock<sup>1</sup>
</td>
</tr>
<tr>
<td>10</td>
<td align="left">Reason for change<sup>1</sup>
</td>
</tr>
<tr>
<td>11</td>
<td align="left">Immunization type<sup>1</sup>
</td>
</tr>
<tr>
<td>12</td>
<td align="left">Expiry<sup>1</sup>
</td>
</tr>
<tr>
<td>13</td>
<td align="left">Vaccine manufacturer<sup>1</sup>
</td>
</tr>
<tr>
<td>14&#x2013;25</td>
<td align="left">12 features derived from immunization date (&#x201c;year&#x201d;, &#x201c;month&#x201d;, &#x201c;week&#x201d;, &#x201c;day&#x201d;, &#x201c;dayofweek&#x201d;, &#x201c;dayofyear&#x201d;, &#x201c;Is_month_end&#x201d;, &#x201c;Is_month_start&#x201d;<break/>&#x201c;Is_quarter_end&#x201d;, &#x201c;Is_quarter_start&#x201d;, &#x201c;Is_year_end&#x201d;, &#x201c;Is_year_start&#x201d;)&#x2a;</td>
</tr>
<tr>
<td>26</td>
<td align="left">Geo-coordinate I: Latitude<sup>2</sup>
</td>
</tr>
<tr>
<td>27</td>
<td align="left">Geo-coordinate II: Longitude<sup>2</sup>
</td>
</tr>
<tr>
<td>28</td>
<td align="left">Geo-coordinate III: Elevation<sup>2</sup>
</td>
</tr>
<tr>
<td>29</td>
<td align="left">Total regional population<sup>2</sup>
</td>
</tr>
<tr>
<td>30</td>
<td align="left">Type of facility<sup>2</sup>
</td>
</tr>
<tr>
<td>31</td>
<td align="left">Ownership<sup>2</sup>
</td>
</tr>
<tr>
<td>32</td>
<td align="left">Average utilization from a three-month rolling average calculation</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>
<xref ref-type="table" rid="T1">
<bold>Table 1</bold>
</xref> is a comprehensive list of all 32 features we used for building the predictive model. Facility refers to a health facility. 1 indicates the data for the feature was obtained through PATH and Tanzania MoH. 2 indicates the data for the feature was obtained from other sources. Here, &#x201c;&#x2a;&#x201d; indicates the vaccination date was split into 12 columns using fastai&#x2019;s add_datepart function (<xref ref-type="bibr" rid="B18">Tanzania Health Ministry Registry</xref> and <xref ref-type="bibr" rid="B19">Tanzania National Bureau of Statistics</xref>).</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s2-2">
<title>Data Preprocessing</title>
<p>We used fastai modules&#x2014;train_cats, add_datepart, and proc_df&#x2014;for initial data pre-processing (<xref ref-type="bibr" rid="B6">Howard and Gugger, 2020</xref>). Specifically, we carried out the following steps &#x2014; 1) Assessing the fraction of null values for each column; no column had more than 4% null values, 2) For categorical variables, null values were treated as a separate level, and imputed cells were recorded in a separate variable, 3) Median values were used to impute missing continuous variables, 4) Date field was split into 12 separate fields such as time elapsed from the start of the year, is date start of month, etc. All date fields are documented elsewhere (fast.ai, see <xref ref-type="bibr" rid="B6">Howard and Gugger, 2020</xref>), and 5) Categorical mapping rules and imputation value to feature mappings were stored in a dictionary which was re-used for pre-processing test data.</p>
</sec>
<sec id="s2-3">
<title>Data Ordering and Partitioning</title>
<p>We aggregated data per facility and vaccine type into biweekly utilization, which we attempted to forecast. The decision to forecast biweekly utilization was based on discussions with healthcare providers in Tanzania (internal communication). Additionally, summarizing the data into monthly or bimonthly rows does not give us sufficient data size to make robustly extrapolatable forecasts. A factor that led to this decision was the frequency of power outages in those regions. We trained, tuned, and evaluated an RFR to forecast biweekly vaccine utilization at a given health facility in Arusha, Tanga, and Kilimanjaro.</p>
<p>Further, approximately 70% of the data was used to train the model. We equally partitioned the remaining data into validation and test sets. The validation data was used for hyper-parameter tuning whereas the test set was used to report final performance scores. Since the goal was to predict future vaccine utilization, train-validation-test split was done in a temporally sorted manner making sure that the hold-out sets contained only data from dates that were in the future relative to the training, or the validation data.</p>
</sec>
<sec id="s2-4">
<title>Measures of Model Performance and Optimization Function</title>
<p>We used two different measures to evaluate model performance, 1) Root Mean Square Error (RMSE) between predicted and actual biweekly utilization values, and 2) Fraction error (F. E), to measure how far each predicted target variable value was from the actual value of that variable. We also used F. E as the optimization function for our model. We define F. E as<disp-formula id="equ1">
<mml:math>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>.</mml:mo>
<mml:mo>&#xa0;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>.</mml:mo>
<mml:mi>U</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>.</mml:mo>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#xf7;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>.</mml:mo>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>where A.U is actual biweekly utilization and P.U is predicted biweekly utilization.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Model Selection</title>
<p>We implemented a wide range of commonly used machine learning algorithms to select the best model. Among all the algorithms&#x2014;regularized linear regression, support vector machine, k-nearest neighbors, RFR and autoregression-based univariate time series models&#x2014;random forests had the best performance on the validation set (<xref ref-type="table" rid="T2">Table 2</xref>). Our exploratory model building also included multilayered neural networks, including recurrent neural nets; however, the models 1) failed to yield comparable performances to some of the other classic machine learning models and, 2) gave largely uninterpretable predictions with no simple way to find feature importances. This bolstered our choice of RFR for vaccine utilization forecasting.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Performances of different machine learning (ML) algorithms on our dataset.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Sl no</th>
<th align="center">ML algorithm</th>
<th align="center">RMSE</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td align="left">Random forest</td>
<td align="char" char=".">17.0</td>
</tr>
<tr>
<td>2</td>
<td align="left">Gradient boosting</td>
<td align="char" char=".">17.2</td>
</tr>
<tr>
<td>3</td>
<td align="left">K-nearest neighbors</td>
<td align="char" char=".">19.1</td>
</tr>
<tr>
<td>4</td>
<td align="left">Elasticnet</td>
<td align="char" char=".">19.3</td>
</tr>
<tr>
<td>5</td>
<td align="left">Support vector regression</td>
<td align="char" char=".">22.4</td>
</tr>
<tr>
<td>6</td>
<td align="left">ARIMA</td>
<td align="char" char=".">24.7</td>
</tr>
<tr>
<td>7</td>
<td align="left">Neural net (3 layered)</td>
<td align="char" char=".">26.5</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>
<xref ref-type="table" rid="T2">
<bold>Table 2</bold>
</xref> Performance, based on RMSE, of different hyper-parameter tuned machine learning models on the validation set.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-2">
<title>Using a Random Forest Regressor to Forecast Bi-Weekly Vaccine Utilization in Tanzania</title>
<p>We trained, tuned, and evaluated an RFR to forecast biweekly vaccine utilization at health facilities in Arusha, and Tanga. We chose RFR for further downstream analyses because, 1) As measured by the Root Mean Square Error (RMSE) RFR outperformed all five of the most commonly used machine learning algorithms and, 2) RFR is a powerful and generalizable predictive framework which can also be leveraged to understand the data better.</p>
<p>We used the validation data to set the values for 1) number of estimators, 2) max features, and 3) min samples leaf. Further, we optimized the model to obtain a minimal validation set RMSE. Our optimized values for these hyperparameters were 40, 0.95, and 7, respectively. We also set n_jobs &#x3d; &#x2212;1 in scikit-learn, to effectively utilize all available compute cores.</p>
<p>We made two different versions of our RFR (<xref ref-type="bibr" rid="B2">Leo Breiman, 2001</xref>) &#x2014; model I which uses all 32 features and model II which uses all 32 features except the 3-month rolling average. The two models are both based on random forests but differ in the number of features. Importantly, recent vaccine utilization averages are not used for forecasting in model II.</p>
</sec>
<sec id="s3-3">
<title>Overall Performance of Our Models at Various Levels of F. E Threshold</title>
<p>Model I. We picked this optimized RFR and achieved an F. E of less than 0.2 for a significant subset of facilities, that is, an error of &#xb1;2 doses where actual vaccine utilization was 10 doses (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>For model I, the percentage of health facilities in Tanzania meeting F. E thresholds. For each of the six vaccines, our model performances across facilities at different F. E thresholds.</p>
</caption>
<graphic xlink:href="frai-03-559617-g001.tif"/>
</fig>
<p>Further, we found that for the majority of health facilities where our predictive performance passed a given threshold for one vaccine, it also passed the same threshold for the other vaccines. We were thus able to predict bi-weekly vaccine utilization within an F. E &#x3c; 0.1 for 22&#x2013;27% of the facilities depending on the vaccine (<xref ref-type="fig" rid="F2">Figure 2</xref>). Our model performed best for rotavirus vaccine utilization, with almost 50% of facilities approaching an F. E &#x3c; 0.2.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>F. E for vaccine types across facilities in Tanzania based on model I. Values were derived from prediction of bi-weekly vaccine doses applying our random forest regressor. Facility ID annotations on the <italic>x</italic>-axis are sparse owing to space limitations.</p>
</caption>
<graphic xlink:href="frai-03-559617-g002.tif"/>
</fig>
<p>Model II. This version of the model is more generalizable since all its features can be derived from publicly available resources. It has an F. E &#x3c; 0.2 for 15&#x2013;20% of facilities (<xref ref-type="sec" rid="s9">Supplementary Figure S1</xref>). Vaccine type was identified as the most important feature for this model (<xref ref-type="sec" rid="s9">Supplementary Figure S2</xref>).</p>
</sec>
<sec id="s3-4">
<title>Feature Importance Using Random Forest Regressor</title>
<p>We used the &#x201c;mean decrease accuracy&#x201d; method to calculate importance scores for all features in our model. This involved randomly permuting each column of data, and then calculating the decrease in <italic>R</italic>
<sup>2</sup> on the out-of-bag datasets (<xref ref-type="bibr" rid="B2">Breiman and Friedman, 1984</xref>). This feature importance scoring scheme, as implemented in scikit-learn, outputs relative feature importance scores.</p>
<p>Model I. By far, the three-months utilization rolling average has the greatest impact on the model prediction. This is followed by the time, relative to year beginning, to vaccination. A number of features that relate to the health facility come next in our model&#x2014;public or private, GIS coordinates, district, and ward. We hypothesize that features related to the facility implicitly encode demand characteristics of the catchment population (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Feature importance for our RF regressor underlying model I was computed by &#x201c;jumbling up&#x201d; the data within each feature (&#x3d; column) and calculating the change in <italic>R</italic>
<sup>2</sup>.</p>
</caption>
<graphic xlink:href="frai-03-559617-g003.tif"/>
</fig>
<p>Model II. Vaccine type was identified as the most important feature for this model (<xref ref-type="sec" rid="s9">Supplementary Figure S3</xref>). Again, a number of features that relate to the health facility showed up next.</p>
<p>We made sure our model performance evaluation included completely non-overlapping &#x201c;future&#x201d; data. We included actual geo-coordinates and altitude to remove subjective levels like &#x201c;High altitude&#x201d; or &#x201c;Low altitude&#x201d;. Removing the geo-coordinates still gives us a meaningful model with only a small (0.7) decrease in forecasting accuracy. This is hardly surprising since the relative feature importance based on the Random Forest Regressor falls off rather sharply after feature &#x23;3 &#x201c;Births&#x201d;.</p>
<p>We hypothesize that features related to the facility implicitly encode demand characteristics of the catchment population.</p>
</sec>
<sec id="s3-5">
<title>Comparisons With Existing Model</title>
<p>In order to get a measure of vaccine utilization forecasts based on the existing system, we mined stock addition data. We did not find evidence of any statistical forecasting model. The amounts of new stock additions immediately prior to our validation and test dates were treated as &#x201c;forecasts&#x201d; made by the baseline, existing system. Since we had actual utilization data for the validation and test period, we were able to calculate F. E and RMSE for the existing system. Evaluation of RFR model performance against this baseline was made on the basis of F. E and RMSE (see <xref ref-type="table" rid="T3">Table 3</xref>).</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison of predictive performance and benchmarking.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Sl no</th>
<th align="center">Predictive model or system</th>
<th align="center">Avg RMSE</th>
<th align="center">Avg F. E</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td align="center">Existing system</td>
<td align="center">351</td>
<td align="char" char=".">43.02</td>
</tr>
<tr>
<td>2</td>
<td align="center">RFR model I</td>
<td align="center">17.90</td>
<td align="char" char=".">1.56</td>
</tr>
<tr>
<td>3</td>
<td align="center">RFR model II</td>
<td align="center">19.00</td>
<td align="char" char=".">2.42</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>
<xref ref-type="table" rid="T3">
<bold>Table 3</bold>
</xref> summarizes benchmarking of the two RFR models against the existing system. RFR model I includes the 3-month moving average as a feature while model II does not.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The Global Alliance for Vaccines and Immunization anticipates a six-fold increase from 2010 to 2020 in the number of vaccine doses given to complete immunization. Global coverage for basic childhood vaccines has reached a record 86%, but there has been a parallel increase in vaccine wastage, decreasing resource efficiency. Vaccine stock-outs compound the problem by wasting opportunities for immunization. There is no good reason why the correlation between higher rates of immunization and supply chain waste should persist. Further, wastage of all kinds is largely a result of inaccurate, univariate or static models of vaccine demand assumptions; in summation the wrong quantity and type of vaccines at the wrong time. There is thus an enormous economic incentive to reduce vaccine wastage and stock-outs, without sacrificing high immunization coverage rates.</p>
<p>Random Forests (<xref ref-type="bibr" rid="B2">Breiman, 2001</xref>) is an algorithm that uses bootstrapping to sample multiple data observations or rows from the original data, builds a decision tree for each bootstrap sample, then integrates the predictions of the multiple decision trees, and finally uses majority vote or averaging to arrive at final predictions. The RFR is conceptualized as a strong predictor combining many weak predictors.</p>
<p>To our knowledge, there has only been one paper where machine learning has been applied to predict vaccine utilization and/or demand (<xref ref-type="bibr" rid="B3">Fruggiero et al., 2012</xref>). In that study, the researchers used a combination of an autoregressive integrated moving average model and neural networks to forecast annual MMR demand in Taipei County, Taiwan. Specifically, the authors use 10 features to build a decision model, using variables related to vaccine demand and variables related to population growth. Our study differs from theirs in several respects: 1) their goal is to forecast annual demand whereas we aim at forecasting bi-weekly demand; 2) they aim at forecasting demand using variables related to population or stock, and no features related to health facility location or the facility itself were included, whereas our model is significantly more comprehensive, as it includes many granular details of the health facilities including their geo-location, altitude, and facility details; and 3) they aim at forecasting county-wide demand whereas we forecast health-facility-level demand. Our model is, therefore, significantly more fine-grained.</p>
<p>Vaccine campaigns sometimes deliver vaccines directly or serve to increase demand. Here, we did not have access to vaccine campaign data. In the next round of data gathering, we may be able to access that data and build an increasingly multidimensional and more accurate model.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<title>Conclusion</title>
<p>In summary, we present for the first time an interpretable predictive model to forecast vaccine utilization that has a broad scope and can be adapted to many countries and regions. Our study underscores the importance of applying machine learning on hard-to-gather, temporally and spatially relevant integrative datasets to make accurate vaccine utilization forecasts. Importantly, we also present two different versions of a predictive model. RFR model 1 has high predictive performance and can be used in places where recent vaccine utilization data is available. RFR model 2 has slightly lower predictive performance but can easily be adapted to other regions and countries, giving it a broader application scope. This is a tool that can help translate the Global Vaccine Action Plan for 2011&#x2013;2020 into action: meeting vaccination coverage targets in every region, country and community and strengthening health systems by empowering program managers with access to high quality information on stock needs at each specific location.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: We will share all data referenced here upon permission of the United Republic of Tanzania; anyone who wishes to see the data must first gain permission from the rightful owner of the data, the Government of Tanzania. Requests to access these datasets should be directed to the Government of Tanzania.</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>BF, DA, and SS designed the study. RH led the analysis and writing of the manuscript. JS, GG, and AS participated in the analysis, manuscript editing and brainstorming. SS also provided overall guidance.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>Our work was funded by the Bill and Melinda Gates Foundation Grand Challenges Exploration Grant &#x201c;Machine Learning for a More Efficient Supply Chain&#x201d; (OPP1181789). We are grateful for the sustained support of the Bill and Melinda Gates Foundation&#x2014;and in particular David Sarley and Tove Ryman, who supported the fundamental innovation of this work in the form of knowledge, commitment, and resources.</p>
</sec>
<sec id="s8" sec-type="COI-statement">
<title>Conflict of Interest</title>
<p>All authors are employed by the company Macro-Eyes, Inc.</p>
</sec>
<ack>
<p>We are grateful to the Ministry of Health of Tanzania and our partner, PATH. PATH colleagues brought experience in vaccines and made it feasible to pilot our approach in three regions of Tanzania.</p>
</ack>
<sec id="s9">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2020.559617/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frai.2020.559617/full&#x23;supplementary-material</ext-link>.</p>
<supplementary-material xlink:href="image1.jpeg" id="SM1" mimetype="image" mime-subtype="jpeg" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="image2.jpeg" id="SM2" mimetype="image" mime-subtype="jpeg" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="image3.jpeg" id="SM3" mimetype="image" mime-subtype="jpeg" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="image4.jpeg" id="SM4" mimetype="image" mime-subtype="jpeg" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
</name>
<name>
<surname>Friedman</surname>
</name>
</person-group> (<year>1984</year>). <source>Classification and regression trees</source>. <publisher-loc>Abingdon&#x2010;on&#x2010;Thames, Oxfordshire, UK</publisher-loc>: <publisher-name>Taylor and Francis</publisher-name>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Random forests</article-title>. <source>Mach. Learn.</source> <volume>45</volume> (<issue>1</issue>), <fpage>5</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1023/a:1010933404324</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fruggiero</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Iannone</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Martino</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Miranda</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Riemma</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>A forecast model for pharmaceutical requirements based on an artificial neural network</article-title>,&#x201d; in <conf-name>Proceedings of 2012 IEEE international conference on service operations and logistics, and informatics</conf-name>, <conf-loc>Suzhou, China</conf-loc>, <conf-date>July 8&#x2013;10, 2012</conf-date> (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>263</fpage>&#x2013;<lpage>268</lpage>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>P&#xe9;rez</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Granger</surname>
<given-names>B. E.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>IPython: a system for interactive scientific computing</article-title>. <source>Comput. Sci. Eng.</source> <volume>9</volume> (<issue>3</issue>), <fpage>21</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1109/mcse.2007.53</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="web">
<comment>GPS Visualizer</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.gpsvisualizer.com/elevation">http://www.gpsvisualizer.com/elevation</ext-link>
</comment> (<comment>Accessed March 28, 2019</comment>).</citation>
</ref>
<ref id="B6">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Howard</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gugger</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>fastai: A layered API for Deep Learning</article-title>. <source>Information</source> <volume>11</volume> (<issue>2</issue>), <fpage>108</fpage>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/fastai/fastai">https://github.com/fastai/fastai</ext-link>
</comment> </citation>
</ref>
<ref id="B7">
<citation citation-type="web">
<comment>Logistimo; Products</comment>. (<year>2011</year>). <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.logistimo.com/product.html">https://www.logistimo.com/product.html</ext-link>
</comment> (<comment>Accessed January 16, 2019</comment>).</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lydon</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schreiber</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gasca</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dumolard</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Urfer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Senouci</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Vaccine stockouts around the world: are essential vaccines always available when needed?</article-title> <source>Vaccine</source> <volume>35</volume> (<issue>17</issue>), <fpage>2121</fpage>&#x2013;<lpage>2126</lpage>. <pub-id pub-id-type="doi">10.1016/j.vaccine.2016.12.071</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mueller</surname>
<given-names>L. E.</given-names>
</name>
<name>
<surname>Haidari</surname>
<given-names>L. A.</given-names>
</name>
<name>
<surname>Wateska</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Phillips</surname>
<given-names>R. J.</given-names>
</name>
<name>
<surname>Schmitz</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Connor</surname>
<given-names>D. L.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>The impact of implementing a demand forecasting system into a low-income country&#x27;s supply chain</article-title>. <source>Vaccine</source> <volume>34</volume> (<issue>32</issue>), <fpage>3663</fpage>&#x2013;<lpage>3669</lpage>. <pub-id pub-id-type="doi">10.1016/j.vaccine.2016.05.027</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Orenstein</surname>
<given-names>W. A.</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Simply put: vaccination saves lives</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>114</volume> (<issue>16</issue>), <fpage>4031</fpage>&#x2013;<lpage>4033</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1704507114</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ozawa</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Portnoy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Grewal</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Brenzel</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Walker</surname>
<given-names>D. G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Return on investment from childhood immunization in low- and middle-income countries, 2011&#x2013;20</article-title>. <source>Health Aff.</source> <volume>35</volume>, <fpage>199</fpage>&#x2013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.1377/hlthaff.2015.1086</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Patel</surname>
<given-names>P. B.</given-names>
</name>
<name>
<surname>Rana</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Jangid</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>Bavarva</surname>
<given-names>N. R.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Bansal</surname>
<given-names>R. K.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Vaccine wastage assessment after introduction of open vial policy in Surat Municipal Corporation area of India</article-title>. <source>Int. J. Health Pol. Manag.</source> <volume>5</volume> (<issue>4</issue>), <fpage>233</fpage>&#x2013;<lpage>236</lpage>. <pub-id pub-id-type="doi">10.15171/ijhpm.2015.208</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="web">
<collab>PATH and World Health Organization</collab> (<year>2011</year>). <article-title>Developing a vision for immunization supply systems in 2020: landscape analysis summaries</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.path.org/publications/files/TS_opt_vision_2020.pdf">http://www.path.org/publications/files/TS_opt_vision_2020.pdf</ext-link>
</comment> (<comment>Accessed January 16, 2019</comment>).</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<collab>Python Software Foundation</collab> (<year>2018</year>). <article-title>Python language reference, version 3.6</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://www.python.org">http://www.python.org</ext-link>
</comment>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rajgopal</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Connor</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Assi</surname>
<given-names>T.-M.</given-names>
</name>
<name>
<surname>Norman</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.-I.</given-names>
</name>
<name>
<surname>Bailey</surname>
<given-names>R. R.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>The optimal number of routine vaccines to order at health clinics in low or middle income countries</article-title>. <source>Vaccine</source> <volume>29</volume>, <fpage>5512</fpage>&#x2013;<lpage>5518</lpage>. <pub-id pub-id-type="doi">10.1016/j.vaccine.2011.05.044</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="web">
<comment>Seaborn 0.8.1</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://seaborn.pydata.org/index.html">http://seaborn.pydata.org/index.html</ext-link>
</comment> (<comment>Accessed January 16, 2019</comment>). <pub-id pub-id-type="doi">10.1007/978-1-4612-0689-7</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="web">
<comment>Tanzania Health Ministry Registry</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://hfr-portal.ucchosting.co.tz/">https://hfr-portal.ucchosting.co.tz/</ext-link>
</comment> (<comment>Accessed March 28, 2019</comment>).</citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<comment>Tanzania National Bureau of Statistics</comment>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.nbs.go.tz/">https://www.nbs.go.tz/</ext-link>
</comment> (<comment>Accessed March 28, 2019</comment>).</citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<collab>John Snow, Inc. and USAID</collab> (<year>2010</year>). <source>Pipeline 5.1</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://deliver.jsi.com/">http://deliver.jsi.com/</ext-link>
</comment> (<comment>Accessed January 16, 2019</comment>).</citation>
</ref>
</ref-list>
</back>
</article>
