<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2020.578675</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Detecting Changes and Avoiding Catastrophic Forgetting in Dynamic Partially Observable Environments</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Dick</surname> <given-names>Jeffery</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/869377/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ladosz</surname> <given-names>Pawel</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1157831/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ben-Iwhiwhu</surname> <given-names>Eseoghene</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1157433/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Shimadzu</surname> <given-names>Hideyasu</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1019989/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kinnell</surname> <given-names>Peter</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/785970/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Pilly</surname> <given-names>Praveen K.</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/72259/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kolouri</surname> <given-names>Soheil</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1157415/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Soltoggio</surname> <given-names>Andrea</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/63326/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Computer Science, Loughborough University</institution>, <addr-line>Loughborough</addr-line>, <country>United Kingdom</country></aff>
<aff id="aff2"><sup>2</sup><institution>Mathematical Sciences, Loughborough University</institution>, <addr-line>Loughborough</addr-line>, <country>United Kingdom</country></aff>
<aff id="aff3"><sup>3</sup><institution>Teikyo University Graduate School of Public Health</institution>, <addr-line>Tokyo</addr-line>, <country>Japan</country></aff>
<aff id="aff4"><sup>4</sup><institution>School of Mechanical, Electrical and Manufacturing Engineering, Loughborough University</institution>, <addr-line>Loughborough</addr-line>, <country>United Kingdom</country></aff>
<aff id="aff5"><sup>5</sup><institution>HRL Laboratories</institution>, <addr-line>Malibu, CA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: James Leland Olds, George Mason University, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Eiji Uchibe, Advanced Telecommunications Research Institute International (ATR), Japan; Juan V. Sanchez-Andres, University of Jaume I, Spain</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Jeffery Dick <email>j.dick&#x00040;lboro.ac.uk</email></corresp>
<corresp id="c002">Andrea Soltoggio <email>a.soltoggio&#x00040;lboro.ac.uk</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>12</month>
<year>2020</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>14</volume>
<elocation-id>578675</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>06</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>11</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2020 Dick, Ladosz, Ben-Iwhiwhu, Shimadzu, Kinnell, Pilly, Kolouri and Soltoggio.</copyright-statement>
<copyright-year>2020</copyright-year>
<copyright-holder>Dick, Ladosz, Ben-Iwhiwhu, Shimadzu, Kinnell, Pilly, Kolouri and Soltoggio</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license> </permissions>
<abstract><p>The ability of an agent to detect changes in an environment is key to successful adaptation. This ability involves at least two phases: learning a model of an environment, and detecting that a change is likely to have occurred when this model is no longer accurate. This task is particularly challenging in partially observable environments, such as those modeled with partially observable Markov decision processes (POMDPs). Some predictive learners are able to infer the state from observations and thus perform better with partial observability. Predictive state representations (PSRs) and neural networks are two such tools that can be trained to predict the probabilities of future observations. However, most such existing methods focus primarily on static problems in which only one environment is learned. In this paper, we propose an algorithm that uses statistical tests to estimate the probability of different predictive models to fit the current environment. We exploit the underlying probability distributions of predictive models to provide a fast and explainable method to assess and justify the model&#x00027;s beliefs about the current environment. Crucially, by doing so, the method can label incoming data as fitting different models, and thus can continuously train separate models in different environments. This new method is shown to prevent catastrophic forgetting when new environments, or tasks, are encountered. The method can also be of use when AI-informed decisions require justifications because its beliefs are based on statistical evidence from observations. We empirically demonstrate the benefit of the novel method with simulations in a set of POMDP environments.</p></abstract>
<kwd-group>
<kwd>POMDP</kwd>
<kwd>PSR</kwd>
<kwd>continual learning</kwd>
<kwd>catastrophic forgetting</kwd>
<kwd>lifelong learning</kwd>
<kwd>neural network</kwd>
</kwd-group>
<contract-num rid="cn001">FA8750-18-C-0103</contract-num>
<contract-sponsor id="cn001">Defense Advanced Research Projects Agency<named-content content-type="fundref-id">10.13039/100000185</named-content></contract-sponsor>
<counts>
<fig-count count="8"/>
<table-count count="0"/>
<equation-count count="7"/>
<ref-count count="38"/>
<page-count count="14"/>
<word-count count="9505"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>A useful skill for an agent that explores the world and learns to act in it is the ability to predict what happens next (Geisser, <xref ref-type="bibr" rid="B11">1993</xref>). One way is to try to learn a model of the world so that predictions are generated within the agent and compared with observations to improve the model. However, this idea assumes that it is possible to learn a large and static model of the entire world. In reality, it is more feasible to try to learn a model of a subset of the world, i.e., an environment. Therefore, an agent may interact in different environments at different points in time. This is a condition that challenges learning algorithms that often need to be set manually by a user that labels tasks or substitutes models for each new task or environment.</p>
<p>A more effective agent would be able to learn different tasks or environments more autonomously, incorporating new knowledge without forgetting the skills learned in a previous task or environment. To accomplish this, an agent needs to learn an environment and detect when a change occurs, or when a completely new environment is met. By doing so, a lifelong learning agent will also remember previously learned environments, and quickly recover their models if old conditions return.</p>
<p>The majority of current machine learning approaches, however, assume that learning occurs in one static environment. These approaches make it possible to optimize policies for one problem, but do not scale well to learning multiple problems, or to learning in an incremental way more tasks over a lifetime (Thrun, <xref ref-type="bibr" rid="B33">1998</xref>). Approaches known as meta reinforcement learning (Finn et al., <xref ref-type="bibr" rid="B10">2017</xref>; Rothfuss et al., <xref ref-type="bibr" rid="B28">2018</xref>; Rakelly et al., <xref ref-type="bibr" rid="B25">2019</xref>; Zintgraf et al., <xref ref-type="bibr" rid="B38">2019</xref>) are designed to learn multiple tasks, but they assume that a signal is given when the task changes. While this is a step toward learning more tasks sequentially, such algorithms still require a teaching signal that labels different tasks. Additionally, reinforcement learning algorithms are intended to optimize a policy that maximizes reward (Sutton and Barto, <xref ref-type="bibr" rid="B32">2018</xref>). Therefore, the knowledge of the environment is implicit in the policy and shaped by the reward function.</p>
<p>One method to quickly adapt to changes in the environment is provided by reinforcement learning approaches that explicitly model the environment, and therefore can rapidly search the parameter space when those change. These approaches, known as model-based reinforcement learning (Doya et al., <xref ref-type="bibr" rid="B8">2002</xref>; Nagabandi et al., <xref ref-type="bibr" rid="B21">2018</xref>; Lecarpentier and Rachelson, <xref ref-type="bibr" rid="B16">2019</xref>), require the model to be hand-designed, and also learn policies with the aim to maximize a reward. In short, because reinforcement learning aims to provide actions that maximize reward, their applicability is limited when there are no actions or rewards available.</p>
<p>If an environment does not provide rewards, it can still be explored and learned with the aim, e.g., to predict future events. The concepts of Markov chain and of Markov decision process (MDP) are abstractions to model an environment when actions are, respectively, absent or present (Bellman, <xref ref-type="bibr" rid="B2">1957</xref>). An extension of MDPs are partially observable Markov decision processes (POMDPs) that account for the fact that observations from a system do not always reveal the state. POMDPs have many important applications in the real world, e.g., the airplane collision avoidance system ACAS X is based on POMDP models (Kochenderfer et al., <xref ref-type="bibr" rid="B14">2015</xref>). POMDPs have also been used to model brain information processing for decision making under uncertainty (Rao, <xref ref-type="bibr" rid="B26">2010</xref>), and to model working memory in the brain (Todd et al., <xref ref-type="bibr" rid="B34">2009</xref>). While POMDPs are a flexible tool to model a variety of real world systems, the assumption of partial observability of the underlying states in the observed environment is precisely what makes it difficult to derive accurate POMDPs from data.</p>
<p>Because POMDPs can include a reward function, much of the research in learning POMDPs falls under reinforcement learning theory and is intended to find an optimal policy in a rewarding environment (see e.g., Shani et al., <xref ref-type="bibr" rid="B31">2005</xref>). One exception is the Baum-Welch algorithm (Rabiner, <xref ref-type="bibr" rid="B24">1989</xref>), designed to generate Hidden Markov Models, that can be adapted for POMDP environments by incorporating actions. One limitation of this method is that it requires knowing the number of states in advance, and works best when provided with an initial estimate of the underlying POMDP.</p>
<p>Predictive state representation (PSR) is a general representation of a POMDP that does not need to learn the underlying states. PSRs, instead, learn the probabilities of future observations. Due to the nature of PSR methods, which learn directly from observations rather than trying to find hidden underlying states, discovering and learning an accurate PSR of a POMDP environment is faster than trying to reconstruct the underlying MDP (Littman et al., <xref ref-type="bibr" rid="B18">2002</xref>). A variety of algorithms have been proposed to improve the learning of PSRs such as transformed predictive state representations (TPSRs) (Rosencrantz et al., <xref ref-type="bibr" rid="B27">2004</xref>) and compressed predictive state representations (CPSRs) (Hamilton et al., <xref ref-type="bibr" rid="B12">2013</xref>). Some algorithms improve the learning method, often utilizing TPSRs and CPSRs (McCracken and Bowling, <xref ref-type="bibr" rid="B20">2005</xref>; Yun-Long and Ren-Hou, <xref ref-type="bibr" rid="B37">2009</xref>; Liu et al., <xref ref-type="bibr" rid="B19">2016</xref>; Downey et al., <xref ref-type="bibr" rid="B7">2017</xref>), while others allow the agent to learn in more complex domains, e.g., with continuous action or observation spaces (Wingate and Singh, <xref ref-type="bibr" rid="B35">2007</xref>; Boots et al., <xref ref-type="bibr" rid="B4">2013</xref>).</p>
<p>Other parametric models such as neural networks (Bishop, <xref ref-type="bibr" rid="B3">1995</xref>) can also take a history of recent actions and observations as input, and be trained to predict the next observation. These approaches are less explainable, but have grown in popularity with the resurgence of neural networks, the use of deep and recurrent networks, and powerful hardware for training (Schmidhuber, <xref ref-type="bibr" rid="B29">2015</xref>).</p>
<p>The approaches cited so far assume that the environment is stationary. Therefore, we can hypothesize that under dynamic conditions where the agent switches between environments over time, such approaches will either learn an average of the environments, or learn accurately the most recent environment while forgetting the previous ones. One example of dynamic conditions is an autonomous driving problem in which a vehicle encounters different environments, such as different driving rules or weather conditions. In theory, two or more POMDP models that alternate over time can be modeled as one single POMDP in which a non-observable state determines the sub-part of the model that generates the current data. However, this approach is likely to increase the complexity of the model significantly. Thus, a hierarchical approach in which different POMDP models are used to predict different environments may be more scalable.</p>
<p>Assume, e.g., a system in which a transition from a state A to B occurs consistently with probability 1, but after some time has passed, the environment dynamics change such that state A leads to C with probability 1. The challenge in learning this case with one single model is that rather than capturing the hidden state, the model could learn that the environment transitions from A to B or to C with a probability of 0.5 for each state. This is true on average, but inaccurate at any particular point in time. As a consequence, the result will be either a model with low accuracy, if a slow learning rate is used, or catastrophic forgetting if a faster learning rate is used.</p>
<p>The idea presented in this paper is to explicitly learn such hierarchically nested hidden states by means of a statistical framework that selects different predictive models to fit a data stream at different times. The proposed approach tracks the probability of a current window of data of fitting different models, and thus the probability of an agent being in one of many possible environments when the only cues, e.g., observations and actions, are implicit in the data stream. This is done by comparing the expected frequencies of observations derived from predictive models with the observed frequencies in the current data stream. Discrete observations and actions follow multinomial distributions, thus, performing &#x003C7;<sup>2</sup> tests is a viable method of estimating the probability of the new observed data to fit a learned predictive model.</p>
<p>An important consequence of assessing a model&#x00027;s probability of fitting the current data is that data points at different times can be assigned to different models to improve them separately and independently. By doing so, we can learn different models under the assumption of stationary conditions and implement continual learning of multiple environments, thus avoiding catastrophic forgetting in dynamic POMDPs. The proposed method provides an evidence-based and explainable algorithm to justify the belief of the system. Moreover, the novel approach does not need reward signals to learn models for different environments, making it a more general method than reward-based approaches such as reinforcement learning.</p>
<p>The next section provides the background on POMDPs that is necessary to introduce the novel algorithm that we name adaptive model detection (AMD). We also briefly introduce PSRs and a simple neural network as baseline model learners to be employed by the novel AMD algorithm explained in section 3. Simulation results are presented in section 4, followed by a discussion and conclusion.</p></sec>
<sec id="s2">
<title>2. Background</title>
<p>Predictive models such as predictive state representations (Littman et al., <xref ref-type="bibr" rid="B18">2002</xref>), neural networks, and POMDPs have been extensively used in the past to model dynamical systems with discrete representations. This section provides the background for these approaches that lay the groundwork for the method in this paper.</p>
<sec>
<title>2.1. Predictive State Representation (PSR) and POMDP</title>
<p>Predictive state representation (PSR) is a model to predict observations in stochastic environments, including POMDPs. A POMDP is defined as a hextuple <inline-formula><mml:math id="M1"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">S</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>T</mml:mi><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#x003A9;</mml:mo><mml:mo>,</mml:mo><mml:mi>R</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, where <inline-formula><mml:math id="M2"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">S</mml:mi></mml:mrow></mml:math></inline-formula> is the set of underlying MDP states; <inline-formula><mml:math id="M3"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow></mml:math></inline-formula> is the set of actions; <italic>T</italic> is the transition function, <inline-formula><mml:math id="M4"><mml:mi>T</mml:mi><mml:mo>:</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">S</mml:mi></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">S</mml:mi></mml:mrow><mml:mo>&#x02192;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:math></inline-formula>, which gives the probability of transitioning from one state to another given the action taken; <italic>R</italic> is the reward function; <inline-formula><mml:math id="M5"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula> is the set of observations; and &#x003A9; is the set of conditional observation probabilities. 
POMDPs differ from MDPs in that the current observation is not sufficient for an agent to be able to determine its underlying state.</p>
<p>Let <italic>t</italic> be a finite stream of action-observation pairs. Then <italic>t</italic> is a test, and we let <inline-formula><mml:math id="M6"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">T</mml:mi></mml:mrow></mml:math></inline-formula> be the set of all tests. Let the history <italic>h</italic><sub><italic>i</italic></sub> &#x02208; <italic>H</italic> of the agent at time <italic>i</italic> be the stream of action-observation pairs <inline-formula><mml:math id="M7"><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#x02200;</mml:mo><mml:mi>j</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02229;</mml:mo><mml:mi>&#x02115;</mml:mi></mml:math></inline-formula> observed up to time <italic>i</italic>.</p>
<p>A PSR (Littman et al., <xref ref-type="bibr" rid="B18">2002</xref>) of an environment consists of a set of core tests, <italic>Q</italic>; a set of |<italic>Q</italic>|-dimensional <italic>m</italic><sub><italic>a,o,t</italic></sub> vectors for all <inline-formula><mml:math id="M8"><mml:mi>a</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>Q</mml:mi></mml:math></inline-formula>; and an initial state.</p>
<p>The set of core tests, <italic>Q</italic>, is a finite subset of <inline-formula><mml:math id="M9"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">T</mml:mi></mml:mrow></mml:math></inline-formula>, with the property that <italic>P</italic>(<italic>t</italic> &#x02223; <italic>h</italic>) for all <inline-formula><mml:math id="M10"><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">T</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>H</mml:mi></mml:math></inline-formula> can be found as some <italic>linear combination</italic> of the probabilities <italic>P</italic>(<italic>q</italic> &#x02223; <italic>h</italic>) for all <italic>q</italic> &#x02208; <italic>Q</italic>. The empty test, &#x003F5; &#x0003D; {}, is always included in <italic>Q</italic>, such that <italic>P</italic>(&#x003F5; &#x02223; <italic>h</italic>) &#x0003D; 1 for all <italic>h</italic> which are possible under the PSR model. The PSR state vector after observing history <italic>h</italic>, <italic>y</italic>(<italic>h</italic>), is a (1 &#x000D7; |<italic>Q</italic>|) vector which holds <italic>P</italic>(<italic>q</italic> &#x02223; <italic>h</italic>) for each <italic>q</italic> &#x02208; <italic>Q</italic>. By stacking the rows of <italic>m</italic><sub><italic>a,o,t</italic></sub> for all <italic>t</italic> &#x02208; <italic>Q</italic>, we obtain a (|<italic>Q</italic>| &#x000D7; |<italic>Q</italic>|) <italic>projection vector</italic> <italic>M</italic><sub><italic>a,o</italic></sub> for every length 1 test (<italic>a,o</italic>). 
For all <italic>h</italic> &#x02208; <italic>H</italic> we have that <inline-formula><mml:math id="M11"><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>o</mml:mi><mml:mo>&#x02223;</mml:mo><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>a</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. Projection vectors for longer tests can be created by multiplying projection vectors for shorter tests. For example, <italic>M</italic><sub><italic>a</italic><sub>1</sub>,<italic>o</italic><sub>1</sub>,<italic>a</italic><sub>2</sub>,<italic>o</italic><sub>2</sub></sub> &#x0003D; <italic>M</italic><sub><italic>a</italic><sub>2</sub>,<italic>o</italic><sub>2</sub></sub> &#x000D7; <italic>M</italic><sub><italic>a</italic><sub>1</sub>,<italic>o</italic><sub>1</sub></sub>, where &#x000D7; is matrix multiplication.</p>
<p>To maintain an accurate state vector, <italic>m</italic><sub><italic>a,o,t</italic></sub> must be available for all <inline-formula><mml:math id="M12"><mml:mi>a</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>Q</mml:mi></mml:math></inline-formula>. Let <italic>Q</italic> &#x0003D; {<italic>t</italic><sub>1</sub>, <italic>t</italic><sub>2</sub>, &#x02026;, <italic>t</italic><sub><italic>n</italic></sub>}. This can be used to obtain the state vector at time <italic>i</italic>, <italic>y</italic>(<italic>h</italic><sub><italic>i</italic></sub>), given the <italic>a</italic><sub><italic>i</italic></sub>, <italic>o</italic><sub><italic>i</italic></sub> action observation pair observed at timestep <italic>i</italic>, and <italic>y</italic>(<italic>h</italic><sub><italic>i</italic>&#x02212;1</sub>). Recall that the PSR state vector contains the probabilities of each core test, and let <italic>y</italic><sub><italic>j</italic></sub>(<italic>h</italic>) denote the element in <italic>y</italic>(<italic>h</italic>) corresponding to core test <italic>t</italic><sub><italic>j</italic></sub>. Then, the probability of each core test can be found as follows. For all <italic>t</italic><sub><italic>j</italic></sub> &#x02208; <italic>Q</italic>:
<disp-formula id="E1"><label>(1)</label><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mi>y</mml:mi><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac><mml:mtext>&#x02003;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Equivalently, we can use the previously defined projection vectors:
<disp-formula id="E2"><label>(2)</label><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac><mml:mtext>&#x02003;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
From the definition above, it follows that PSRs can give an indication of the probability of certain transitions to occur. In particular, the following theorem specifies the probability of observing particular action observation pairs:</p>
<p><bold>THEOREM 1</bold>. Given that the state vector is <italic>y</italic>(<italic>h</italic><sub><italic>i</italic>&#x02212;1</sub>) at time <italic>i</italic> &#x02212; 1, the probability of seeing observation <italic>o</italic><sub><italic>i</italic></sub> after taking action <italic>a</italic><sub><italic>i</italic></sub> at time step <italic>i</italic> is given by the following equation
<disp-formula id="E3"><label>(3)</label><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mtext>&#x02003;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Proof: By construction. Note that the state vector contains enough information to accurately predict future observations even in partially observable environments. The state vector acts not only as a prediction, but also as an internal state for the PSR model.</p>
<p>A natural question to ask is, &#x02018;what is the predictive state of an empty history <italic>y</italic>(&#x003F5;)?&#x02019;. If the environment is known to always start in a given underlying state, the corresponding predictive state may be used. However, this is a strong assumption in general. If we assume the agent is likely to start in each underlying state proportionally to the amount of time previously spent in that state, <italic>y</italic>(&#x003F5;) can be set to the <italic>stationary distribution</italic>. The stationary distribution, which is the weighted expected value of <italic>y</italic> over all time steps, is given by
<disp-formula id="E4"><label>(4)</label><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003F5;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>H</mml:mi></mml:mrow></mml:msub></mml:mstyle><mml:mi>y</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mi>H</mml:mi><mml:mo>|</mml:mo></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
where <italic>H</italic> is the set of all histories. As <italic>H</italic> is an infinite set, this is calculated instead from the histories that the agent has seen. When calculated this way, the stationary distribution may change depending on the policy followed by the agent. Additionally, as the stationary distribution represents a distribution over all states, it may not be a state that the agent can reach through normal exploration; however, it represents a positive probability for all states previously visited.</p>
<p>The state space of the PSR is the set of states that can be generated recursively by <inline-formula><mml:math id="M17"><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mrow></mml:mfrac></mml:math></inline-formula>, for all <inline-formula><mml:math id="M18"><mml:mi>a</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula>, where <italic>y</italic>&#x02032; is known to be in the state space of the PSR, and <inline-formula><mml:math id="M19"><mml:mi>y</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is non-zero. In order to generate the set of states, an initial state <italic>y</italic>&#x02032; must be assumed to be in the state space. Sometimes an initial state is provided when the starting state of the environment is known. However, when no initial state is provided, the stationary distribution may be used.</p>
<p>There are several algorithms for learning PSRs offline, but relatively few for learning and discovering tests online. One such algorithm is the constrained gradient algorithm (McCracken and Bowling, <xref ref-type="bibr" rid="B20">2005</xref>), which we use in our experiments for learning PSRs.</p></sec>
<sec>
<title>2.2. Neural Network Predictors</title>
<p>A simple neural network (Bishop, <xref ref-type="bibr" rid="B3">1995</xref>) can be trained to predict observations in a given environment. Given a time window of duration <italic>i</italic>, a neural network can take the observations <inline-formula><mml:math id="M20"><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula> and the actions <inline-formula><mml:math id="M21"><mml:mi>a</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow></mml:math></inline-formula> and predict the new observation <italic>o</italic> at time <italic>i</italic> &#x0002B; 1. If the softmax transfer function is used to produce output probabilities, the network can also be trained to predict the probability of each observation at <italic>i</italic> &#x0002B; 1 in stochastic environments. The difference between the prediction and the observation can be used to train such a system with gradient descent. As for the PSR model explained in the previous section, a neural network predictor can be trained effectively only if stationary conditions are assumed during the training phase. Thus, changes in the environment such as those occurring in dynamic POMDPs (see next section) are challenging conditions that this study addresses.</p>
<p>Neural networks, PSRs, and other predictive models have one thing in common: they give as output a prediction of the probability of seeing each possible observation next. The sum of probabilities of seeing each observation is 1. Let <italic>K</italic> be a predictive model, <inline-formula><mml:math id="M22"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula> be the set of possible observations, and <inline-formula><mml:math id="M23"><mml:mrow><mml:mi mathvariant="-tex-caligraphic">A</mml:mi></mml:mrow></mml:math></inline-formula> be the set of possible actions. Then, we define <italic>P</italic>(<italic>o</italic> &#x02223; <italic>K</italic><sub><italic>i</italic></sub>) to be the predicted probability of observing <italic>o</italic> at time <italic>i</italic> given by <italic>K</italic>. Additionally, we define <inline-formula><mml:math id="M24"><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> as the probability distribution over all observations at time <italic>i</italic> given by <italic>K</italic>.</p></sec>
<sec>
<title>2.3. Dynamic POMDPs</title>
<p>Assume that an environment remains stationary for a certain amount of time. Under this assumption, it is possible to learn a model of the environment (e.g., a PSR, as detailed in McCracken and Bowling, <xref ref-type="bibr" rid="B20">2005</xref>). If, after a certain amount of time, the environment changes, the continual training of the same model will lead to inaccurate predictions at first, and catastrophic forgetting of the first environment in the long term. This is because the assumption of stationary conditions is not valid, and one model tries to learn an average distribution of two or more distributions that occur in different environments. These changing environments are similar to switching hidden Markov models (SHMMs) investigated in Chuk et al. (<xref ref-type="bibr" rid="B5">2020</xref>) and H&#x000F6;ffken et al. (<xref ref-type="bibr" rid="B13">2009</xref>).</p>
<p>Let &#x1D54D; &#x0003D; {<italic>V</italic><sub>1</sub>, <italic>V</italic><sub>2</sub>, &#x02026;, <italic>V</italic><sub><italic>m</italic></sub>} be a set of distinct POMDP environments. A dynamic POMDP environment, <italic>D</italic>, behaves as a single environment <italic>V</italic><sub><italic>i</italic></sub> &#x02208; &#x1D54D; for a number of time steps, <italic>n</italic><sub>0</sub>, before changing its behavior to another environment, <italic>V</italic><sub><italic>j</italic></sub> &#x02208; &#x1D54D;. The dynamic environment <italic>D</italic> may switch to any environment in &#x1D54D; every <italic>n</italic><sub><italic>k</italic></sub> time steps, with <italic>k</italic> &#x02208; &#x02115; and <italic>n</italic><sub><italic>k</italic></sub> &#x0226B; 1. Effectively, these dynamics can be seen as hierarchical large stationary models where a state variable <italic>z</italic> &#x02208; &#x02115; determines the specific environment <italic>V</italic><sub><italic>z</italic></sub> at a given time. However, <italic>z</italic> is not observable and can only be derived inferring which specific environment from the set &#x1D54D; matches the current stream of data. We assume that transitions between environments in &#x1D54D; occur with considerably lower frequency than state transitions within the environments <italic>V</italic><sub><italic>z</italic></sub>. This assumption reflects two points: (1) in real world scenarios, generally, two environments can be thought of as being distinct when transitions from one to another occur rarely, otherwise it makes more sense to consider them as one environment; (2) for a model to learn and predict one environment, it is necessary to experience the environment for a minimum number of steps that capture transitions within it.</p>
<p>Such dynamic conditions occur typically when an agent learns to predict an environment, e.g., to navigate in a house, and is then required to learn a new somewhat different environment, e.g., to navigate in an office. The new environment might bear a similarity with the previous one, but also significant differences. Desirable skills of an agent include the ability to detect that there is a new environment, the ability to learn the new environment quickly, possibly exploiting previous knowledge, and also retain the knowledge of the previous environment (avoiding catastrophic forgetting). The aim of this study is precisely to achieve such capabilities as explained in the next section.</p></sec></sec>
<sec id="s3">
<title>3. Adaptive Model Detection</title>
<p>The idea and the method for detecting different environments and training different models according to such detection is explained in this section. We name our new approach adaptive model detection (AMD) because it detects likely models to fit the data, and works with adaptive models that evolve as new data is collected.</p>
<sec>
<title>3.1. Statistical Model Selection</title>
<p>Given a set of statistical models that each predict an environment, a question that can be asked is: what model is best at predicting a given stream of data? Several approaches have been proposed in the literature to perform <italic>model selection</italic> (Cox, <xref ref-type="bibr" rid="B6">2006</xref>). Approaches for model selection are based on information theory, such as the Akaike information criterion (Akaike, <xref ref-type="bibr" rid="B1">1974</xref>) and the Bayesian (or Schwarz) information criterion (Schwarz, <xref ref-type="bibr" rid="B30">1978</xref>). The idea is to measure the information that is lost due to the difference in statistical distributions between the model and the data using the Kullback-Leibler divergence (Kullback, <xref ref-type="bibr" rid="B15">1997</xref>). By doing so, it is possible to select a best fitting model that minimizes this difference. Different statistical methods, however, may be more or less appropriate or accurate according to a number of factors including the number of data points and the assumptions on the statistical distributions that are being observed.</p>
<p>Given a recent window of data from one environment, <italic>hypothesis testing</italic> can be used to accept or reject the null hypothesis that the distribution of the environment data matches the distribution of the model-generated data (Lehmann and Romano, <xref ref-type="bibr" rid="B17">2006</xref>). The ability to accept or reject such a null hypothesis is a valuable tool, particularly in the context of explainable AI applications in which a model is used to predict a data stream. If the current data stream has a very low <italic>p</italic>-value, it is reasonable to reject the null hypothesis that the model is correct. On the contrary, if the null hypothesis cannot be rejected, the model offers a good approximation and it is therefore reasonable to (1) consider it as a good-enough predictor and (2) use new data to further improve it via a training process.</p>
<p>We assume that (1) a data stream is generated by one environment <italic>V</italic><sub><italic>i</italic></sub> from the set &#x1D54D;; (2) we want to learn and identify which model <italic>K</italic><sub><italic>i</italic></sub> &#x02208; &#x1D542; describes the current data stream, including the case in which none of the current models describe the data; (3) the data stream produces a set of finite discrete or categorical outcomes. Therefore, we use hypothesis testing instead of model selection, so that we can determine when none of the existing models fit the data, allowing us to create a new model. To identify which specific <italic>V</italic><sub><italic>i</italic></sub> is generating the data, the problem can be formulated as selecting the corresponding model <italic>K</italic><sub><italic>i</italic></sub> that maximizes the likelihood
<disp-formula id="E5"><label>(5)</label><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">argmax</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo class="qopname">^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
where <italic>h</italic> is a limited time window of recent input data.</p>
<p>It is important to note that there are no guarantees that a current set of models, &#x1D542;, can accurately predict a corresponding set of environments &#x1D54D;. However, the key idea tested is: assuming we can select the most fitting model given by Equation (5), then we can associate a particular time window of the data stream and use it to improve the model <italic>K</italic><sub><italic>i</italic></sub> to maximize <inline-formula><mml:math id="M26"><mml:mover accent="true"><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula>. Therefore, such an approach can be used both to learn and to find the best set of models that fit a set of environments.</p>
<p>The data generated by the set of actions and observations in a POMDP as described in the previous section form a multinomial distribution. Thus, the &#x003C7;<sup>2</sup> test can be used to accept or reject the null hypothesis that a recent time window of data is generated by a given model. In the next sections, the procedure to compute the degrees of freedom and the &#x003C7;<sup>2</sup> <italic>p</italic>-values is presented.</p></sec>
<sec>
<title>3.2. Calculating Degrees of Freedom</title>
<p>The degrees of freedom (DF) of a statistical model <italic>K</italic> corresponds to the number of independent parameters in the model, and is required to perform statistical tests such as &#x003C7;<sup>2</sup>. Predictive models have in common the ability to predict the probability distribution of the next observation given a history or internal state. Note that we are not trying to find the number of independent parameters in the predictive agent, but in the underlying model the agent predicts. Let <inline-formula><mml:math id="M27"><mml:msup><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula>. If, for all <inline-formula><mml:math id="M28"><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>\</mml:mo><mml:msup><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> we know the value of <italic>P</italic>(<italic>o</italic> &#x02223; <italic>K</italic><sub><italic>i</italic></sub>), then
<disp-formula id="E6"><label>(6)</label><mml:math id="M29"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>\</mml:mo><mml:msup><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:munder></mml:mstyle><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>o</mml:mi><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Therefore, the number of independent parameters for each prediction is <inline-formula><mml:math id="M30"><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>|</mml:mo><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> as the final observation&#x00027;s probability can be inferred from the others. Each prediction may be independent in the predictive model, thus, we assume that the predictive model is an approximation to a statistical model (the underlying POMDP) with a number of independent states that is much smaller than the length of a history of data points. To estimate the number of independent states in the underlying model, we cluster together similar predictions in the predictive model. These can be clustered according to the predictive model&#x00027;s hidden states, the prediction of the next observation, or the prediction of several next observations.</p></sec>
<sec>
<title>3.3. Sampling and Clustering Probabilities</title>
<p>The adaptive model detection algorithm (AMD) keeps a history window <italic>W</italic> of up to <italic>L</italic> recent prediction observation pairs, where <italic>L</italic> is a parameter of the algorithm. The choice of <italic>L</italic> affects the behavior of the algorithm as shown in the results section with an analysis of different values for <italic>L</italic>.</p>
<p>Knowing how many times each prediction has been made is necessary to determine whether the environment behaves as the predictive model expects. To count the number of times different predictions are given by a learning model, it is necessary to cluster such predictions that are expressed as vectors with possibly slightly different probabilities values. Let <inline-formula><mml:math id="M31"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">C</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> be the set of all clusters made by AMD for the predictive model. For a given cluster <inline-formula><mml:math id="M32"><mml:mi>c</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">C</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>, let <inline-formula><mml:math id="M33"><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> be the mean of the distributions <inline-formula><mml:math id="M34"><mml:mi>P</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mi>c</mml:mi></mml:math></inline-formula> in the cluster, and <inline-formula><mml:math id="M35"><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> therefore be the mean probability of observing observation <italic>o</italic>.</p>
<p>AMD uses the DBSCAN clustering algorithm (Ester et al., <xref ref-type="bibr" rid="B9">1996</xref>) to form clusters of predictions given by a model. The scikit-learn (Pedregosa et al., <xref ref-type="bibr" rid="B23">2011</xref>) vanilla implementation of the algorithm is used. As the data changes over time, the DBSCAN algorithm may form clusters differently on each timestep. This does not pose an issue, as <inline-formula><mml:math id="M36"><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> can be recalculated at each timestep, meaning that even when the clusters change, the expected and observed frequencies will be similar in an accurate agent. DBSCAN does not include into clusters those outlier points which seem to not fit into larger clusters. Such a property is advantageous in our context because: (1) the mean distribution of a cluster <inline-formula><mml:math id="M37"><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> is therefore not affected by an outlier that was forced into it, and (2) clusters must be formed of a certain minimum size, the advantage of which is explained in section 3.5.</p></sec>
<sec>
<title>3.4. Calculating &#x003C7;<sup>2</sup> <italic>p</italic>-Value</title>
<p>To compute the &#x003C7;<sup>2</sup>, AMD counts as <italic>X</italic><sub><italic>c,o</italic></sub> the number of times in the history that each observation <italic>o</italic> follows predictions in each cluster <italic>c</italic> for all <italic>c</italic> &#x02208; <italic>C</italic> and <inline-formula><mml:math id="M38"><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:math></inline-formula>. The number of times each observation <italic>o</italic> is expected to follow predictions in a given cluster is given by <inline-formula><mml:math id="M39"><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>|</mml:mo><mml:mi>c</mml:mi><mml:mo>|</mml:mo><mml:mo>&#x000D7;</mml:mo><mml:mover accent="true"><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>o</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
<p>Thus, from Pearson (<xref ref-type="bibr" rid="B22">1900</xref>),
<disp-formula id="E7"><label>(7)</label><mml:math id="M40"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">C</mml:mi></mml:mrow></mml:mrow></mml:munder></mml:mstyle><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="-tex-caligraphic">O</mml:mi></mml:mrow></mml:mrow></mml:munder></mml:mstyle><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mtext>&#x02003;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Effectively, &#x003C7;<sup>2</sup> measures how well the actual data matches the expected data, with higher values meaning that the observed data does not match the expected data well. For a general model, some values of <italic>X</italic><sub><italic>c,o</italic></sub> and <italic>E</italic><sub><italic>c,o</italic></sub> may be equal or close to 0, corresponding to impossible (or never observed) transitions. For these cases when <italic>X</italic><sub><italic>c,o</italic></sub> &#x0003D; <italic>E</italic><sub><italic>c,o</italic></sub> &#x0003D; 0, the value <inline-formula><mml:math id="M41"><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>o</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></inline-formula> is set to 0.</p>
<p>Knowing the &#x003C7;<sup>2</sup> value and DF allows us to find the <italic>p</italic>-value, which represents the probability of the observed data coming from the expected data distribution. This function is available in most statistical programs, code libraries, and toolkits as part of the &#x003C7;<sup>2</sup> implementation<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref>.</p></sec>
<sec>
<title>3.5. &#x003C7;<sup>2</sup> Testing for Adaptive Model Detection</title>
<p>Following the guidelines in (Yates et al., <xref ref-type="bibr" rid="B36">1999</xref>), a &#x003C7;<sup>2</sup> test is considered to be sufficiently accurate when &#x0201C;no more than 20% of the expected counts are less than five, and all individual expected counts are one or greater.&#x0201D; Accordingly, a minimum history length is necessary to be able to perform the test, and the longer the history, the more accurate the test is expected to be. Unfortunately, with an arbitrary large POMDP, even a long history does not guarantee that all possible prediction observation pairs have been seen. Additionally, while a long history length makes <italic>p</italic>-values more accurate, a long history means that the assessment of the probability for a model can be done only over a long period of time, potentially losing granularity on the precise point of the transition.</p>
<p>Even with a long history length, if the agent encounters a sequence of observations it does not expect, it may produce an internal state or prediction unlike any it has generated before. Due to this, the cluster containing the unusual state or prediction could be small enough that the expected number of times a given observation follows states in the cluster is lower than 1. Conveniently, DBSCAN has a minimum cluster size, so such small clusters can be avoided entirely, unless, of course, the probability of observing a particular observation from states in a cluster is very low.</p>
<p>An AMD that tests data against multiple predictive models <italic>K</italic><sub>1</sub>, <italic>K</italic><sub>2</sub>, &#x02026;, <italic>K</italic><sub><italic>n</italic></sub> keeps track of the <italic>p</italic>-value of each one, and uses this tracked <italic>p</italic>-value to determine which environment it is most likely to be in. This allows AMD to select which model is trained at any point in time in a statistically motivated way, contributing to explainable decisions in AI. Additionally, detecting which environments are likely at given points in time opens up the possibility of applying a different policy based on the current environment.</p>
<p>Note that as long as the <italic>p</italic>-value is above the threshold at which the null hypothesis is rejected, it does not matter whether the value is low, high, or fluctuating. The AMD algorithm is summarized with pseudo code in Algorithm 1.</p></sec></sec>
<sec id="s4">
<title>4. Simulation Results</title>
<p>The effectiveness of the algorithm in a variety of settings is tested with computer simulations. The set of environments is introduced in section 4.1. The effect of the history length parameter <italic>L</italic> on the detection speed and stability is investigated in section 4.2. In section 4.3, the algorithm is extended to demonstrate how labeling incoming data can be used to continually train separate models, and thus avoid catastrophic forgetting.</p>
<sec>
<title>4.1. Chosen Environments</title>
<p>The proposed algorithm was tested on a set of POMDPs of various size and complexity. The first set of problems (<xref ref-type="fig" rid="F1">Figures 1A&#x02013;D</xref>) is a series of uncontrolled POMDP environments, i.e., Markov chain environments, with only 2 observations (as there are more states than observations, the observation is given by the color of the state in <xref ref-type="fig" rid="F1">Figure 1</xref>). These environments appear simple at first, however, they have different states and transition probabilities, and, due to their stochastic nature, some environments could be mistaken for others based on the data generated by exploration. For example, environment C creates a data stream that can be produced by environment D. However, when interacting with the environment over a longer period, the observations reveal the data stream is more likely to be generated by environment C than environment D.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Set of POMDP environments. States are represented with circles, actions with squares, and transition probabilities with numbers on arrows. The color of the state corresponds to the observation. Partial observability derives from the fact that different states provide the same observation. The first set <bold>(A&#x02013;D)</bold> are simple Markov chains where transitions occur without actions, with probabilities indicated near the transition arrows. The second set <bold>(E1,E2)</bold> are more complex problems where actions determine different paths in a cycle but have deterministic transitions. Environments <bold>(F1,F2)</bold> are POMDPs with some deterministic and some stochastic transitions.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0001.tif"/>
</fig>
<p>The next set of environments, <xref ref-type="fig" rid="F1">Figures 1E1,E2</xref>, represents a decision process originating in S0 where one sequence of actions takes the agent to S7, and all other sequences lead to S9. The challenge in this set is that the observations do not reveal the distance from S0, and thus make it hard for an agent to locate itself along the graph as it progresses from left to right. E1 and E2 have two further variations, E3 and E4 (not shown). E3 is the same as E1 but the transitions from S1 have inverted actions. Similarly, E4 is the same as E2 but transitions from S1 have inverted actions. These four environments are very similar in structure, but they require different policies to be traversed from <italic>S</italic>0 to <italic>S</italic>7.</p>
<p>Finally, environments <xref ref-type="fig" rid="F1">Figures 1F1,F2</xref> have most of their transition probabilities being exactly the same. This means that transitions in the data stream distinguishing the two environments occur less frequently than in some other environments.</p></sec>
<sec>
<title>4.2. Speed and Reliability of Detections: Impact of History Length</title>
<p>The experiments in this section assess overall stability of the detection and the impact of the history length expressed by the parameter <italic>L</italic>. <xref ref-type="fig" rid="F2">Figure 2</xref> shows the effect of different history lengths when tracking the probability of an accurate PSR model for environment C. In <xref ref-type="fig" rid="F2">Figure 2</xref> (left), the values of <italic>L</italic> for 40, 60, 80, 100, and 120 are shown. In all cases, the AMD shows consistent <italic>p</italic>-values of 0 and 1, indicating that the algorithm can confidently determine which model the data stream belongs to. The longer the history, the slower the change in <italic>p</italic>-value, confirming the intuitive notion that longer histories require more time steps to reveal a change in the environment. When assessed on the stochastic environments A and B (<xref ref-type="fig" rid="F2">Figure 2</xref>, right), the dropping <italic>p</italic>-values indicate that stochasticity is a significant confounding factor in the detection of the environment.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>AMD <italic>p</italic>-values with deterministic and stochastic environments. <bold>(Left)</bold> Tracking of environment C with different history lengths (parameter <italic>L</italic>). C is deterministic and the <italic>p</italic>-values are consistent at 0 and 1, but the time taken to detect the change in environment varies with the history length. <bold>(Right)</bold> AMD <italic>p</italic>-values for the model A and B with the stochastic environments A and B and <italic>L</italic> &#x0003D; 120. Tracking stochastic environments results in <italic>p</italic>-values predicting the wrong environment more frequently.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0002.tif"/>
</fig>
<p>It can be concluded that a small <italic>L</italic> is advantageous to detect changes more readily only if the environment is predominantly deterministic. When the environment has stochastic transitions, a longer history might be necessary to guarantee the stability of the detection. To further assess these dynamics, <xref ref-type="fig" rid="F3">Figure 3</xref> shows the comparison of a short and a long history window (<italic>L</italic> &#x0003D; 20 and <italic>L</italic> &#x0003D; 120) on the deterministic environment C and the stochastic environment D. The AMD <italic>p</italic>-values show that while C can be tracked reliably with both <italic>L</italic> &#x0003D; 20 and <italic>L</italic> &#x0003D; 120, environment D (orange line) causes the <italic>p</italic>-value to oscillate, although with less amplitude, even with <italic>L</italic> &#x0003D; 120.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>AMD <italic>p</italic>-values for the models of environments C and D. <bold>(Left)</bold> With a short history length (<italic>L</italic> = 20), the <italic>p</italic>-values for the environment D occasionally indicate the data matches environment C, and not environment D. <bold>(Right)</bold> With a longer history length (<italic>L</italic> = 120), the <italic>p</italic>-value for model D becomes more stable, not dropping below 0.3.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0003.tif"/>
</fig>
<p>Other factors that can affect the stability of the <italic>p</italic>-values could include the complexity and the similarity with other environments. In <xref ref-type="fig" rid="F4">Figure 4</xref> (left), the tracking of the environments E1 and E2 is shown with <italic>L</italic> varying from 60 to 220. The faster settings (<italic>L</italic> &#x0003D; 60 and <italic>L</italic> &#x0003D; 100) appear to detect E2 when still in E1. With <italic>L</italic> &#x0003D; 140, the detection becomes more reliable, and with <italic>L</italic> &#x0003D; 180 and <italic>L</italic> &#x0003D; 220 the detection is accurate, although slower after step 2,000 to detect the transition from E1 to E2. <xref ref-type="fig" rid="F4">Figure 4</xref> (right) shows the <italic>p</italic>-values for all four E models tracked simultaneously. Despite the similarity of these four environments, the <italic>p</italic>-values show high confidence in determining which model currently matches the data stream. A similar result is also observed in <xref ref-type="fig" rid="F5">Figure 5</xref> where the environments of the set F are tested. The stochasticity in this set does not affect significantly the stability of the <italic>p</italic>-values. This is because the environment is primarily deterministic, and although the points where the data streams differ do not occur often, the <italic>p</italic>-value drops significantly when they do occur.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Stability of detections for the deterministic set of models E. The agents traversing this environment use a random policy, where actions are chosen from the set of possible actions which can be taken at each state. <bold>(Left)</bold> The environment transitions from E1 to E2. The graph shows <italic>p</italic>-values for the environment E2. <bold>(Right)</bold> <italic>p</italic>-values for all four models are shown while all four environments alternate (<italic>L</italic> = 220).</p></caption>
<graphic xlink:href="fnbot-14-578675-g0004.tif"/>
</fig>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p><italic>p</italic>-values for the models of the set F with <italic>L</italic> = 120. Agents traversing this environment utilize a random policy.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0005.tif"/>
</fig>
</sec>
<sec>
<title>4.3. Avoiding Catastrophic Forgetting With Continual Learning of Multiple Models</title>
<p>The ability to detect which environment the current stream of data belongs to allows the system to train different models independently, and thus implement continual learning and avoid catastrophic forgetting. The scenario devised in this section is when two unknown environments X and Y alternate while a learning system is trying to learn them from the data stream. This condition is particularly challenging because the data stream is generated by two different environments, both unknown. Therefore, there is an obvious bootstrap problem. How can the AMD know when the environment changes before any environment has been learned?</p>
<p>A reasonable assumption to overcome this problem is to assume that the data stream is initially generated by a single environment for a certain amount of time, so that one single model can be at least partially trained on the initial data stream.</p>
<sec>
<title>4.3.1. AMD With Constrained Gradient PSRs</title>
<p>Two simple but highly stochastic environments, shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, are chosen for this test; the results are depicted in <xref ref-type="fig" rid="F7">Figure 7</xref>. The learning setup for this first test uses the constrained gradient PSR learner. It starts to learn a first model while environment X produces data for the first 10k steps. When the <italic>p</italic>-value suddenly drops and remains low at step 10,000, the AMD clearly indicates that the first model is not valid anymore. Thus, the new data is used to train a new model. A similar process occurs for the following environment changes: the PSR with the highest <italic>p</italic>-value is trained, and the other is left idle. The AMD continues to track the probabilities of each model. Effectively, each chunk of data that is identified by the AMD as belonging to one model is used to train that model only and thus enables continual learning and prevents catastrophic forgetting.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Environments X and Y. These environments have only two states, but are stochastic. Environment X is more likely to remain in its current state at each time step, whereas environment Y is more likely to transition between states.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0006.tif"/>
</fig>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Avoiding catastrophic forgetting with AMD on PSR learners. The unknown environments X and Y alternate over time. <bold>(Top)</bold> The performance is measured as the negative log of the prediction error of each model. A single PSR (black line) is trained on the data stream and it learns an average of the two environments. The azure continuous line and the orange dashed line show the AMD-guided learning: first X is learned, and when the data switches to environment Y, a new model Y is also learned. Subsequently, data points originating from the two environments are used to further improve each model separately. <bold>(Bottom)</bold> The AMD <italic>p</italic>-values for the AMD-guided learners are shown during the process.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0007.tif"/>
</fig>
<p>As a baseline, we ran the constrained gradient PSR learning algorithm with the same parameters, but without AMD detecting switches in the environment. The agent learns well until time step 10,000. At time step 10,000&#x02013;20,000, the agent also learns the second environment well, although it is not able to reach the same performance as the PSR with AMD. From time step 20,000 onwards, it is clear that the agent has experienced catastrophic forgetting, as each time the environment is switched, the performance decreases dramatically.</p>
<p>In these experiments, the error is calculated as the average prediction error (the difference between the predicted probabilities of the next observation and the actual probabilities of the next observation) over 10,000 time steps in an independent data stream.</p></sec>
<sec>
<title>4.3.2. AMD With Neural Network Learners</title>
<p>To validate the AMD with the neural network learning models, we employed a simple three-layer network whose details are specified in the <xref ref-type="supplementary-material" rid="SM1">Appendix 6.1.2</xref>. <xref ref-type="fig" rid="F8">Figure 8</xref> shows the performance and <italic>p</italic>-values when the AMD is applied in combination with the neural network models.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Avoiding catastrophic forgetting with AMD on neural networks models. This experiment is the same as that in the previous figure, but uses neural network learners instead of the constrained gradient algorithm. <bold>(Top)</bold> The performance of the AMD-enabled models is shown in the environments X and Y. The baseline is capable of continuous learning, but forgets the previous environment each time there is a switch. <bold>(Bottom)</bold> The AMD <italic>p</italic>-values for the AMD-guided learner are shown during the process.</p></caption>
<graphic xlink:href="fnbot-14-578675-g0008.tif"/>
</fig>
<p>We observe the same learning dynamics that were obtained with the PSR learner, although the neural network learner appears to achieve slightly better performance. The single model (baseline) shows the typical learning curves when different tasks or environments are learned sequentially, with catastrophic forgetting occurring each time the environment switches. The AMD instead can accurately determine when the environment switches and use the data to train two different models.</p></sec></sec></sec>
<sec sec-type="discussion" id="s5">
<title>5. Discussion</title>
<p>The results presented in the previous section show that the idea to use statistical tests to determine the best fitting model to label data is a promising avenue of research. Various aspects of the algorithm and of the experimental results prompt interesting questions.</p>
<p>The first important aspect that was investigated in section 4.2, the impact of history length, shows that the readiness in detection and stability are opposed objectives that need to be balanced. However, stochasticity appears to be the main factor that requires longer histories to guarantee stability. We speculate that, while high levels of stochasticity are an obvious obstacle to learning, it would be possible to learn also such features of the environments and incorporate them in future developments of the AMD. One possibility is to introduce an adaptive history length that could reduce if an environment is predominantly deterministic, and increase in length when highly stochastic transitions require more data points to acquire meaningful statistics.</p>
<p>A second observation is that the dynamic properties of a series of POMDPs can include both sudden changes and slow progressive drifts. In the case of drifts that progressively increase the distance between the distributions of the model and the environment, there will be a race between the adaptation speed of a learning algorithm and the AMD <italic>p</italic>-values. If the learning algorithm is fast enough to track the drift, the AMD will maintain a high <italic>p</italic>-value, thus maintaining confidence in the current model. This condition, however, would lead to progressive forgetting of the original distribution. If, on the other hand, the drift in the environment is faster than the speed at which the learning algorithm can adapt, the AMD will see the corresponding <italic>p</italic>-value drop and either select a different model, or create a new model to learn the new data distribution. While we did not investigate these conditions, the problem of deciding whether an environment is drifting to a new distribution, or changing significantly to warrant the instantiation of a new model, is a relevant aspect of lifelong learning worthy of future studies.</p>
<p>The AMD is intended as a framework that is independent of the specific learning algorithm used to learn a model. However, it is worth pointing out that the AMD is limited by the underlying learning model. In fact, while the <italic>p</italic>-values of sub-optimal models could be low, and thus lead to model rejection or further learning, there are cases in which this is not true, leading to simpler models having higher <italic>p</italic>-values than more accurate ones. In fact, a <italic>p</italic>-value could be high when the environment is more complex than the model. Consider, e.g., an environment that generates a data stream of observations 0, 0, 1, 1, 0, 0, 1, 1, &#x02026;, where each observation 1 or 0 occurs twice in a row. A model that predicts that after each 1, the next observation will be 0 or 1 with equal probability will score a high <italic>p</italic>-value although a better model could be learned. In short, the AMD <italic>p</italic>-values might not always provide the best metrics to assess the quality of a model. While choosing the simplest model to fit the data might prove effective to prevent overfitting and agrees with Occam&#x00027;s razor, further analysis might reveal how best to integrate the AMD algorithm with specific learning methods.</p>
<p>Given a set of models, one interesting question is what approach is more effective when a new model is necessary to learn a new environment. In the context of lifelong learning, a desirable property is that of exploiting previous knowledge to accelerate the learning of new tasks. It is possible that the AMD could facilitate such a forward transfer by instantiating a new model that matches some properties of the new data. While this problem was not touched in this study, the AMD may provide useful statistical insights to inform the creation of new models.</p>
<p>The set of problems proposed in this study appears simple at first. However, it is worth noting that partial observability and stochasticity make it difficult to derive the correct model from observations even in relatively simple environments. Additionally, the complexity of an environment might derive from a large input space, e.g., when using raw images in a navigation task. We speculate that the use of the AMD in combination with large neural models for feature extraction could allow the extension of this method to more complex problems.</p>
<p>Finally, it is important to note that this study introduces the idea of model selection from statistics in the domain of dynamic POMDPs without rewards. We could not identify existing methods that could be used for a direct performance comparison. However, with the addition of a reward function, this study could be extended to incorporate a policy component, and thus place the approach in the field of reinforcement learning. Given the large amount of research in reinforcement learning, this extension would open several exciting research directions and comparisons with recent RL and meta-RL approaches.</p></sec>
<sec sec-type="conclusions" id="s6">
<title>6. Conclusion</title>
<p>This study introduces an algorithm that aims to address a limitation of many current learning systems: the inability to monitor a non-stationary data stream while learning from it. The proposed system, named adaptive model detection (AMD), monitors the data stream generated by partially observable Markov decision processes with the aim to assess the probability of the data fitting a given model. Statistical tests determine (1) whether the null hypothesis that a current model produces the data can be accepted or rejected and (2) which specific model from a set is more accurate to predict a recent window of data. The novel algorithm was tested with two types of predictive models, PSRs and neural networks. The simulations show that the approach is not only useful for quickly adapting to changes in an environment, but can also be useful to associate a stream of data to a particular environment. By doing so, it is possible to continuously train different models for different environments, and thus prevent catastrophic forgetting while learning multiple environments. The approach can be extended to address a wide set of problems beyond the limited scope of the environments tested here. The method could be valuable in AI applications where critical decisions require an evidence-based and justifiable process. When multiple environments are presented sequentially and require incremental learning without labels, rewards, or signals that a change has occurred, the approach presented can be used to implement continuous lifelong learning abilities.</p></sec>
<sec sec-type="data-availability-statement" id="s7">
<title>Data Availability Statement</title>
<p>Publicly available data were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://github.com/JupiLogy/adaptive-model-detection">https://github.com/JupiLogy/adaptive-model-detection</ext-link>.</p></sec>
<sec id="s8">
<title>Author Contributions</title>
<p>JD developed the novel algorithm, wrote the computer code and performed the experiments. JD and AS devised the research plan and methods, analyzed the results, plotted the graphs and wrote the paper. PL and EB-I performed and analyzed experimental results. PP, SK, and PK contributed to the formulation of the research hypotheses. HS and SK provided support for the statistical method. All authors contributed to writing the final manuscript.</p></sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>SK and PP were employed by HRL Laboratories. The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
</body>
<back>
<sec sec-type="supplementary-material" id="s9">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fnbot.2020.578675/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fnbot.2020.578675/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Presentation_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Akaike</surname> <given-names>H.</given-names></name></person-group> (<year>1974</year>). <article-title>A new look at the statistical model identification</article-title>. <source>IEEE Trans. Automat. Control</source> <volume>19</volume>, <fpage>716</fpage>&#x02013;<lpage>723</lpage>. <pub-id pub-id-type="doi">10.1109/TAC.1974.1100705</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bellman</surname> <given-names>R.</given-names></name></person-group> (<year>1957</year>). <article-title>A Markovian decision process</article-title>. <source>Indiana Univ. Math. J</source>. <volume>6</volume>, <fpage>679</fpage>&#x02013;<lpage>684</lpage>. <pub-id pub-id-type="doi">10.1512/iumj.1957.6.56038</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Bishop</surname> <given-names>C. M.</given-names></name></person-group> (<year>1995</year>). <source>Neural Networks for Pattern Recognition</source>. <publisher-name>Oxford University Press</publisher-name>. <pub-id pub-id-type="doi">10.1201/9781420050646.ptb6</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Boots</surname> <given-names>B.</given-names></name> <name><surname>Gretton</surname> <given-names>A.</given-names></name> <name><surname>Gordon</surname> <given-names>G. J.</given-names></name></person-group> (<year>2013</year>). <article-title>Hilbert space embeddings of predictive state representations</article-title>, in <source>Uncertainty in Artificial Intelligence - Proceedings of the 29th Conference, UAI 2013</source> (<publisher-loc>Bellevue, WA</publisher-loc>), <fpage>92</fpage>&#x02013;<lpage>101</lpage>.</citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chuk</surname> <given-names>T.</given-names></name> <name><surname>Chan</surname> <given-names>A. B.</given-names></name> <name><surname>Shimojo</surname> <given-names>S.</given-names></name> <name><surname>Hsiao</surname> <given-names>J. H.</given-names></name></person-group> (<year>2020</year>). <article-title>Eye movement analysis with switching hidden Markov models</article-title>. <source>Behav. Res. Methods</source> <volume>52</volume>, <fpage>1026</fpage>&#x02013;<lpage>1043</lpage>. <pub-id pub-id-type="doi">10.3758/s13428-019-01298-y</pub-id><pub-id pub-id-type="pmid">31712999</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cox</surname> <given-names>D. R.</given-names></name></person-group> (<year>2006</year>). <source>Principles of Statistical Inference</source>. <publisher-loc>Cambridge</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>. <pub-id pub-id-type="doi">10.1017/CBO9780511813559</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Downey</surname> <given-names>C.</given-names></name> <name><surname>Hefny</surname> <given-names>A.</given-names></name> <name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Boots</surname> <given-names>B.</given-names></name> <name><surname>Gordon</surname> <given-names>G.</given-names></name></person-group> (<year>2017</year>). <article-title>Predictive state recurrent neural networks</article-title>, in <source>Advances in Neural Information Processing Systems</source>, eds <person-group person-group-type="editor"><name><surname>Guyon</surname> <given-names>I.</given-names></name> <name><surname>Luxburg</surname> <given-names>U. V.</given-names></name> <name><surname>Bengio</surname> <given-names>S.</given-names></name> <name><surname>Wallach</surname> <given-names>H.</given-names></name> <name><surname>Fergus</surname> <given-names>R.</given-names></name> <name><surname>Vishwanathan</surname> <given-names>S.</given-names></name> <name><surname>Garnett</surname> <given-names>R.</given-names></name></person-group> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>Curran Associates</publisher-name>), <fpage>6054</fpage>&#x02013;<lpage>6065</lpage>. <pub-id pub-id-type="pmid">31762098</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Doya</surname> <given-names>K.</given-names></name> <name><surname>Samejima</surname> <given-names>K.</given-names></name> <name><surname>Katagiri</surname> <given-names>K.-I.</given-names></name> <name><surname>Kawato</surname> <given-names>M.</given-names></name></person-group> (<year>2002</year>). <article-title>Multiple model-based reinforcement learning</article-title>. <source>Neural Comput</source>. <volume>14</volume>, <fpage>1347</fpage>&#x02013;<lpage>1369</lpage>. <pub-id pub-id-type="doi">10.1162/089976602753712972</pub-id><pub-id pub-id-type="pmid">12020450</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ester</surname> <given-names>M.</given-names></name> <name><surname>Kriegel</surname> <given-names>H.-P.</given-names></name> <name><surname>Sander</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>X.</given-names></name></person-group> (<year>1996</year>). <article-title>A density-based algorithm for discovering clusters in large spatial databases with noise</article-title>, in <source>Proceedings of the Second International Conference on Knowledge Discovery and Data Mining, KDD&#x00027;96</source> (<publisher-loc>Portland, OR</publisher-loc>: <publisher-name>AAAI Press</publisher-name>), <fpage>226</fpage>&#x02013;<lpage>231</lpage>.</citation></ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Finn</surname> <given-names>C.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Model-agnostic meta-learning for fast adaptation of deep networks</article-title>, in <source>International Conference on Machine Learning</source> (<publisher-loc>Sydney, NSW</publisher-loc>).</citation></ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Geisser</surname> <given-names>S.</given-names></name></person-group> (<year>1993</year>). <source>Predictive Inference</source>, <volume>Vol. 55</volume>. <publisher-name>New York, NY</publisher-name>: <publisher-name>CRC Press</publisher-name>.</citation></ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hamilton</surname> <given-names>W. L.</given-names></name> <name><surname>Fard</surname> <given-names>M. M.</given-names></name> <name><surname>Pineau</surname> <given-names>J.</given-names></name></person-group> (<year>2013</year>). <article-title>Modelling sparse dynamical systems with compressed predictive state representations</article-title>, in <source>30th International Conference on Machine Learning, ICML 2013</source> (<publisher-loc>Atlanta, GA</publisher-loc>), <fpage>178</fpage>&#x02013;<lpage>186</lpage>.</citation></ref>
<ref id="B13">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>H&#x000F6;ffken</surname> <given-names>M.</given-names></name> <name><surname>Oberhoff</surname> <given-names>D.</given-names></name> <name><surname>Kolesnik</surname> <given-names>M.</given-names></name></person-group> (<year>2009</year>). <article-title>Switching hidden Markov models for learning of motion patterns in videos</article-title>, in <source>Lecture Notes in Computer Science</source>, eds <person-group person-group-type="editor"><name><surname>Alippi</surname> <given-names>C.</given-names></name> <name><surname>Polycarpou</surname> <given-names>M.</given-names></name> <name><surname>Panayiotou</surname> <given-names>C.</given-names></name> <name><surname>Ellinas</surname> <given-names>G.</given-names></name></person-group> (<publisher-loc>Limassol</publisher-loc>: <publisher-name>Springer, Berlin, Heidelberg</publisher-name>), <fpage>757</fpage>&#x02013;<lpage>766</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-642-04274-4_78</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kochenderfer</surname> <given-names>M. J.</given-names></name> <name><surname>Amato</surname> <given-names>C.</given-names></name> <name><surname>Chowdhary</surname> <given-names>G.</given-names></name> <name><surname>How</surname> <given-names>J. P.</given-names></name> <name><surname>Davison Reynolds</surname> <given-names>H. J.</given-names></name> <name><surname>Thornton</surname> <given-names>J. R.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Optimized airborne collision avoidance</article-title>, in <source>Decision Making Under Uncertainty: Theory and Application</source> (<publisher-name>MIT Press</publisher-name>), <fpage>249</fpage>&#x02013;<lpage>276</lpage>.</citation></ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kullback</surname> <given-names>S.</given-names></name></person-group> (<year>1997</year>). <source>Information Theory and Statistics</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Courier Corporation</publisher-name>.</citation></ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lecarpentier</surname> <given-names>E.</given-names></name> <name><surname>Rachelson</surname> <given-names>E.</given-names></name></person-group> (<year>2019</year>). <article-title>Non-stationary Markov decision processes, a worst-case approach using model-based reinforcement learning</article-title>, in <source>Advances in Neural Information Processing Systems 32</source>, eds <person-group person-group-type="editor"><name><surname>Wallach</surname> <given-names>H.</given-names></name> <name><surname>Larochelle</surname> <given-names>H.</given-names></name> <name><surname>Beygelzimer</surname> <given-names>A.</given-names></name> <name><surname>d&#x00027;Alch&#x000E9;-Buc</surname> <given-names>F.</given-names></name> <name><surname>Fox</surname> <given-names>E.</given-names></name> <name><surname>Garnett</surname> <given-names>R.</given-names></name></person-group> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>7216</fpage>&#x02013;<lpage>7225</lpage>.</citation></ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lehmann</surname> <given-names>E. L.</given-names></name> <name><surname>Romano</surname> <given-names>J. P.</given-names></name></person-group> (<year>2006</year>). <source>Testing Statistical Hypotheses</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer Science &#x00026; Business Media</publisher-name>.</citation></ref>
<ref id="B18">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Littman</surname> <given-names>M. L.</given-names></name> <name><surname>Sutton</surname> <given-names>R. S.</given-names></name> <name><surname>Singh</surname> <given-names>S.</given-names></name></person-group> (<year>2001</year>). <article-title>Predictive representations of state</article-title>, in <source>Advances in Neural Information Processing Systems</source>, eds <person-group person-group-type="editor"><name><surname>Dietterich</surname> <given-names>T.</given-names></name> <name><surname>Becker</surname> <given-names>S.</given-names></name> <name><surname>Ghahramani</surname> <given-names>Z.</given-names></name></person-group> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>MIT Press</publisher-name>), <fpage>1555</fpage>&#x02013;<lpage>1561</lpage>.</citation></ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>H.</given-names></name> <name><surname>Zeng</surname> <given-names>Y.</given-names></name> <name><surname>Dai</surname> <given-names>Z.</given-names></name></person-group> (<year>2016</year>). <article-title>Learning predictive state representations via Monte-Carlo tree search</article-title>, in <source>IJCAI International Joint Conference on Artificial Intelligence</source> (<publisher-loc>New York, NY</publisher-loc>), <fpage>3192</fpage>&#x02013;<lpage>3198</lpage>.</citation></ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>McCracken</surname> <given-names>P.</given-names></name> <name><surname>Bowling</surname> <given-names>M.</given-names></name></person-group> (<year>2005</year>). <article-title>Online discovery and learning of predictive state representations</article-title>, in <source>Advances in Neural Information Processing Systems</source>, eds <person-group person-group-type="editor"><name><surname>Weiss</surname> <given-names>Y.</given-names></name> <name><surname>Sch&#x000F6;lkopf</surname> <given-names>B.</given-names></name> <name><surname>Platt</surname> <given-names>J.</given-names></name></person-group> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>Curran Associates</publisher-name>), <fpage>875</fpage>&#x02013;<lpage>882</lpage>.</citation></ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Nagabandi</surname> <given-names>A.</given-names></name> <name><surname>Kahn</surname> <given-names>G.</given-names></name> <name><surname>Fearing</surname> <given-names>R. S.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Neural network dynamics for model-based deep reinforcement learning with model-free fine-tuning</article-title>, in <source>2018 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Brisbane, QLD</publisher-loc>), <fpage>7559</fpage>&#x02013;<lpage>7566</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8463189</pub-id></citation></ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pearson</surname> <given-names>K.</given-names></name></person-group> (<year>1900</year>). <article-title>On the criterion that a given system of deviations from the probable in the case of a correlated system of variables is such that it can be reasonably supposed to have arisen from random sampling</article-title>. <source>Lond. Edinb. Dubl. Phil. Mag.</source> <volume>50</volume>, <fpage>157</fpage>&#x02013;<lpage>175</lpage>. <pub-id pub-id-type="doi">10.1080/14786440009463897</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pedregosa</surname> <given-names>F.</given-names></name> <name><surname>Varoquaux</surname> <given-names>G.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name> <name><surname>Michel</surname> <given-names>V.</given-names></name> <name><surname>Thirion</surname> <given-names>B.</given-names></name> <name><surname>Grisel</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res</source>. <volume>12</volume>, <fpage>2825</fpage>&#x02013;<lpage>2830</lpage>. <pub-id pub-id-type="doi">10.5555/1953048.2078195</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rabiner</surname> <given-names>L. R.</given-names></name></person-group> (<year>1989</year>). <article-title>A tutorial on hidden Markov models and selected applications in speech recognition</article-title>. <source>Proc. IEEE</source> <volume>77</volume>, <fpage>257</fpage>&#x02013;<lpage>286</lpage>. <pub-id pub-id-type="doi">10.1109/5.18626</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rakelly</surname> <given-names>K.</given-names></name> <name><surname>Zhou</surname> <given-names>A.</given-names></name> <name><surname>Finn</surname> <given-names>C.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name> <name><surname>Quillen</surname> <given-names>D.</given-names></name></person-group> (<year>2019</year>). <article-title>Efficient off-policy meta-reinforcement learning via probabilistic context variables</article-title>, in <source>International Conference on Machine Learning</source> (<publisher-loc>Long Beach, CA</publisher-loc>), <fpage>5331</fpage>&#x02013;<lpage>5340</lpage>.</citation></ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rao</surname> <given-names>R. P.</given-names></name></person-group> (<year>2010</year>). <article-title>Decision making under uncertainty: a neural model based on partially observable Markov decision processes</article-title>. <source>Front. Comput. Neurosci</source>. <volume>4</volume>:<fpage>146</fpage>. <pub-id pub-id-type="doi">10.3389/fncom.2010.00146</pub-id><pub-id pub-id-type="pmid">21152255</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rosencrantz</surname> <given-names>M.</given-names></name> <name><surname>Gordon</surname> <given-names>G.</given-names></name> <name><surname>Thrun</surname> <given-names>S.</given-names></name></person-group> (<year>2004</year>). <article-title>Learning low dimensional predictive representations</article-title>, in <source>Proceedings, Twenty-First International Conference on Machine Learning, ICML 2004</source> (<publisher-loc>Banff, AB</publisher-loc>), <fpage>695</fpage>&#x02013;<lpage>702</lpage>. <pub-id pub-id-type="doi">10.1145/1015330.1015441</pub-id></citation></ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rothfuss</surname> <given-names>J.</given-names></name> <name><surname>Lee</surname> <given-names>D.</given-names></name> <name><surname>Clavera</surname> <given-names>I.</given-names></name> <name><surname>Asfour</surname> <given-names>T.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name></person-group> (<year>2018</year>). <article-title>ProMP: Proximal meta-policy search</article-title>, in <source>International Conference on Learning Representations</source> (<publisher-loc>Vancouver, BC</publisher-loc>).</citation></ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Deep learning in neural networks: an overview</article-title>. <source>Neural Netw</source>. <volume>61</volume>, <fpage>85</fpage>&#x02013;<lpage>117</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2014.09.003</pub-id><pub-id pub-id-type="pmid">25462637</pub-id></citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schwarz</surname> <given-names>G.</given-names></name></person-group> (<year>1978</year>). <article-title>Estimating the dimension of a model</article-title>. <source>Ann. Stat</source>. <volume>6</volume>, <fpage>461</fpage>&#x02013;<lpage>464</lpage>.</citation></ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Shani</surname> <given-names>G.</given-names></name> <name><surname>Brafman</surname> <given-names>R. I.</given-names></name> <name><surname>Shimony</surname> <given-names>S. E.</given-names></name></person-group> (<year>2005</year>). <article-title>Model-based online learning of POMDPs</article-title>, in <source>Proceedings of 16th European Conference on Machine Learning</source>, eds <person-group person-group-type="editor"><name><surname>Gama</surname> <given-names>J.</given-names></name> <name><surname>Camacho</surname> <given-names>R.</given-names></name> <name><surname>Brazdil</surname> <given-names>P. B.</given-names></name> <name><surname>Jorge</surname> <given-names>A. M.</given-names></name> <name><surname>Torgo</surname> <given-names>L.</given-names></name></person-group> (<publisher-loc>Porto; Berlin; Heidelberg</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>353</fpage>&#x02013;<lpage>364</lpage>. <pub-id pub-id-type="doi">10.1007/11564096_35</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sutton</surname> <given-names>R. S.</given-names></name> <name><surname>Barto</surname> <given-names>A. G.</given-names></name></person-group> (<year>2018</year>). <source>Reinforcement Learning: An Introduction</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</citation></ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Thrun</surname> <given-names>S.</given-names></name></person-group> (<year>1998</year>). <article-title>Lifelong learning algorithms</article-title>, in <source>Learning to Learn</source>, eds <person-group person-group-type="editor"><name><surname>Thrun</surname> <given-names>S.</given-names></name> <name><surname>Pratt</surname> <given-names>L.</given-names></name></person-group> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>181</fpage>&#x02013;<lpage>209</lpage>. <pub-id pub-id-type="doi">10.1007/978-1-4615-5529-2_8</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Todd</surname> <given-names>M. T.</given-names></name> <name><surname>Niv</surname> <given-names>Y.</given-names></name> <name><surname>Cohen</surname> <given-names>J. D.</given-names></name></person-group> (<year>2009</year>). <article-title>Learning to use working memory in partially observable environments through dopaminergic reinforcement</article-title>, in <source>Advances in Neural Information Processing Systems</source>, eds <person-group person-group-type="editor"><name><surname>Koller</surname> <given-names>D.</given-names></name> <name><surname>Schuurmans</surname> <given-names>D.</given-names></name> <name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Bottou</surname> <given-names>L.</given-names></name></person-group> (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>Curran Associates</publisher-name>), <fpage>1689</fpage>&#x02013;<lpage>1696</lpage>.</citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wingate</surname> <given-names>D.</given-names></name> <name><surname>Singh</surname> <given-names>S.</given-names></name></person-group> (<year>2007</year>). <article-title>On discovery and learning of models with predictive representations of state for agents with continuous actions and observations</article-title>. <source>Proc. Int. Conf. Auton. Agents</source> <volume>5</volume>, <fpage>1136</fpage>&#x02013;<lpage>1143</lpage>. <pub-id pub-id-type="doi">10.1145/1329125.1329352</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yates</surname> <given-names>D.</given-names></name> <name><surname>Moore</surname> <given-names>D.</given-names></name> <name><surname>McCabe</surname> <given-names>G.</given-names></name></person-group> (<year>1999</year>). <source>The Practice of Statistics</source>. <publisher-loc>New York, NY</publisher-loc>: <publisher-name>H. Freeman &#x00026; Company</publisher-name>.</citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yun-Long</surname> <given-names>L.</given-names></name> <name><surname>Ren-Hou</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). <article-title>Discovery and learning of models with predictive state representations for dynamical systems without reset</article-title>. <source>Knowl.-Based Syst</source>. <volume>22</volume>, <fpage>557</fpage>&#x02013;<lpage>561</lpage>. <pub-id pub-id-type="doi">10.1016/j.knosys.2009.01.001</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zintgraf</surname> <given-names>L. M.</given-names></name> <name><surname>Shiarlis</surname> <given-names>K.</given-names></name> <name><surname>Kurin</surname> <given-names>V.</given-names></name> <name><surname>Hofmann</surname> <given-names>K.</given-names></name> <name><surname>Whiteson</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <source>Fast Context Adaptation via Meta-Learning</source>. <publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>ICML</publisher-name>.</citation></ref>
</ref-list>
<fn-group>
<fn id="fn0001"><p><sup>1</sup>In our implementations, the scipy function scipy.special.chdtrc is used.</p></fn>
</fn-group>
<fn-group>
<fn fn-type="financial-disclosure"><p><bold>Funding.</bold> This material was based upon work supported by the United States Air Force Research Laboratory (AFRL) and Defense Advanced Research Projects Agency (DARPA) under Contract No. FA8750-18-C-0103. Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the United States Air Force Research Laboratory (AFRL) and Defense Advanced Research Projects Agency (DARPA).</p>
</fn>
</fn-group>
</back>
</article>