<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">819107</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2022.819107</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Learning State-Variable Relationships in POMCP: A Framework for Mobile Robots</article-title>
<alt-title alt-title-type="left-running-head">Zuccotto et al.</alt-title>
<alt-title alt-title-type="right-running-head">Learning State-Variable Relationships in POMCP</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zuccotto</surname>
<given-names>Maddalena</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1385329/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Piccinelli</surname>
<given-names>Marco</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/1550570/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Castellini</surname>
<given-names>Alberto</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1134741/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Marchesini</surname>
<given-names>Enrico</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/1550764/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Farinelli</surname>
<given-names>Alessandro</given-names>
</name>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1897831/overview"/>
</contrib>
</contrib-group>
<aff>
<institution>Department of Computer Science</institution>, <institution>University of Verona</institution>, <addr-line>Verona</addr-line>, <country>Italy</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1299887/overview">Pedro U. Lima</ext-link>, University of Lisbon, Portugal</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/9004/overview">Dimitri Ognibene</ext-link>, University of Milano-Bicocca, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1159482/overview">Jennifer Renoux</ext-link>, &#xd6;rebro University, Sweden</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Maddalena Zuccotto, <email>maddalena.zuccotto@univr.it</email>; Alberto Castellini, <email>alberto.castellini@univr.it</email>; Alessandro Farinelli, <email>alessandro.farinelli@univr.it</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Computational Intelligence in Robotics, a section of the journal Frontiers in Robotics and AI</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>19</day>
<month>07</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>9</volume>
<elocation-id>819107</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>11</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>06</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Zuccotto, Piccinelli, Castellini, Marchesini and Farinelli.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Zuccotto, Piccinelli, Castellini, Marchesini and Farinelli</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>We address the problem of learning relationships on state variables in Partially Observable Markov Decision Processes (POMDPs) to improve planning performance. Specifically, we focus on Partially Observable Monte Carlo Planning (POMCP) and represent the acquired knowledge with a Markov Random Field (MRF). We propose, in particular, a method for learning these relationships on a robot as POMCP is used to plan future actions. Then, we present an algorithm that deals with cases in which the MRF is used on episodes having unlikely states with respect to the equality relationships represented by the MRF. Our approach acquires information from the agent&#x2019;s action outcomes to adapt online the MRF if a mismatch is detected between the MRF and the true state. We test this technique on two domains, rocksample, a standard rover exploration task, and a problem of velocity regulation in industrial mobile robotic platforms, showing that the MRF adaptation algorithm improves the planning performance with respect to the standard approach, which does not adapt the MRF online. Finally, a ROS-based architecture is proposed, which allows running the MRF learning, the MRF adaptation, and MRF usage in POMCP on real robotic platforms. In this case, we successfully tested the architecture on a Gazebo simulator of rocksample. A video of the experiments is available in the Supplementary Material, and the code of the ROS-based architecture is available online.</p>
</abstract>
<kwd-group>
<kwd>planning under uncertainty</kwd>
<kwd>POMCP</kwd>
<kwd>POMDP</kwd>
<kwd>prior knowledge</kwd>
<kwd>Markov Random Fields</kwd>
<kwd>learning</kwd>
<kwd>mobile robot planning</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Planning under uncertainty is a problem of sequential decision-making, which has important applications in artificial intelligence and robotics. Over the last 2&#xa0;decades, the interest in this topic has grown rapidly due to methodological improvements and the application of these techniques to real-world domains, such as smart buildings, industrial machinery controllers, and mobile robot navigation. Intelligent and autonomous agents have been, in fact, recently employed in complex domains (e.g., search and rescue, warehouse pick-and-place operations, and mobile robot navigation) where the environment is only partially observable. In such domains, it is hard to have complete knowledge of the environment in which the agent acts. In this work, we tackle a specific problem in the context of planning under uncertainty: the problem of learning probabilistic state-variable relationships. Consider, for instance, a warehouse made of aisles with different traffic levels. A robot has to move in the warehouse to accomplish some tasks. The state of the system contains the robot&#x2019;s position and the configuration of traffic levels in each aisle, but the traffic levels are not known by the robot. It has to discover them using noisy sensors while moving in the warehouse. In this case, each state variable represents the traffic level of an aisle. In the following, state variables refer to the hidden part of the state. Learning state-variable relationships in this context means learning the relationships among the traffic levels of different aisles in the warehouse. The rationale is that if the robot knows that two aisles have the same traffic levels with a high probability, it can improve its planning performance because once it has observed the traffic level of one aisle, it has also acquired some knowledge about the traffic of the other aisle and it can plan considering that knowledge.</p>
<p>Partially Observable Markov Decision Processes (POMDPs) (<xref ref-type="bibr" rid="B44">Sondik, 1978</xref>; <xref ref-type="bibr" rid="B19">Kaelbling et al., 1998</xref>) are a powerful framework for planning under uncertainty. Markov Decision Processes (MDPs) (<xref ref-type="bibr" rid="B36">Russell and Norvig, 2010</xref>) are extended to the case of partially observable environments. To tackle partial observability, they consider all possible states of the (agent-environment) system and assign to each of them a probability value expressing the related likelihood of being the true state. These probabilities, considered as a whole, constitute a probability distribution over states, called belief. A solution for a POMDP is a policy that maps beliefs into actions. The computation of optimal policies is unfeasible in practice (<xref ref-type="bibr" rid="B31">Papadimitriou and Tsitsiklis, 1987</xref>). Therefore, much effort was put into developing approximate (<xref ref-type="bibr" rid="B18">Hauskrecht, 2000</xref>) and online (<xref ref-type="bibr" rid="B35">Ross et al., 2008</xref>) solvers. The most recent approaches mainly rely on the use of point-based value iteration (<xref ref-type="bibr" rid="B45">Spaan and Vlassis, 2004</xref>, <xref ref-type="bibr" rid="B46">2005</xref>; <xref ref-type="bibr" rid="B56">Veiga et al., 2014</xref>) or Monte-Carlo Tree Search (MCTS) based solvers (<xref ref-type="bibr" rid="B22">Kocsis and Szepesv&#xe1;ri, 2006</xref>; <xref ref-type="bibr" rid="B8">Browne et al., 2012</xref>) to deal with large state spaces. Deep Reinforcement Learning (DRL) approaches are instead used to learn policies directly from observations, without using a model of the environment dynamics (<xref ref-type="bibr" rid="B40">Silver et al., 2016</xref>, <xref ref-type="bibr" rid="B41">2017</xref>; <xref ref-type="bibr" rid="B49">Sutton and Barto, 2018</xref>). 
Planning and reinforcement learning methods have also been used together (<xref ref-type="bibr" rid="B26">Leonetti et al., 2016</xref>) to allow adaptation to the environment and increased reliability. Among the main MCTS-based solvers (<xref ref-type="bibr" rid="B53">Thrun, 2000</xref>; <xref ref-type="bibr" rid="B22">Kocsis and Szepesv&#xe1;ri, 2006</xref>), a meaningful improvement was obtained by Partially Observable Monte Carlo Planning (POMCP) (<xref ref-type="bibr" rid="B42">Silver and Veness, 2010</xref>), a pioneering algorithm that allows applying model-based reinforcement learning to very large state spaces, overcoming the scalability problem that has limited the usage of POMDPs for many years.</p>
<p>We apply the proposed approach for learning state-variable relationships to POMCP. The standard version of this algorithm does not consider any kind of prior knowledge about state-variable relationships. <xref ref-type="bibr" rid="B9">Castellini et al. (2019)</xref> proposed an extension of POMCP, which considers these relationships in the form of Constraint Networks (CNs) or Markov Random Fields (MRFs). In that work, the introduction of such knowledge provides an improvement in terms of planning performance, with no additional overhead in terms of time complexity. However, it is assumed to have full knowledge about the CN or MRF containing the state-variable constraints. This knowledge could be provided, for instance, by experts. Herein, instead, we deal with a methodology for learning this knowledge in the form of an MRF. The literature provides some general approaches for learning MRFs, mainly in the context of computer vision (<xref ref-type="bibr" rid="B57">Vuffray et al., 2020</xref>; <xref ref-type="bibr" rid="B39">Shah et al., 2021</xref>), but they are very general and often time-consuming. On the contrary, our proposed approach is specialized in planning under uncertainty with POMDPs. Hence, it integrates with POMCP without increasing its time complexity. Learning pairwise MRFs, instead of general MRFs, requires a smaller amount of data, which is important in the context of planning, where each learning episode can take a long time. Let us consider, for instance, the case study of the warehouse mentioned above, in which a learning episode could last an entire day of work, having the autonomous robot collect data about traffic levels in the aisles while doing its job.</p>
<p>In this work, we propose three methodological advancements. The first is an algorithm for learning the MRF during the execution of POMCP. The second is implementing a framework to integrate POMCP in ROS, enabling the employment of the MRF learning algorithm on real robotic platforms, with experiments performed on Gazebo simulators of known application domains. The ROS-based architecture allows learning the MRF on real robotic platforms. It comprises three ROS nodes: environment, agent, and planning. The environment node discretizes the real world by exploiting a task-specific representation. The agent node, instead, holds information about odometry and interfaces the ROS-based robotic platform with the environment and the planner. Finally, the planner node runs the learning algorithm. The third advancement is an algorithm called &#x201c;Adapt&#x201d; (see <xref ref-type="statement" rid="algorithm_2">Algorithm 2</xref>), which deals with cases in which we use the learned MRF in episodes with unlikely state-variable configurations with respect to the joint probability defined by MRF. This algorithm runs when the knowledge provided by the learned MRF does not reflect the true state-variable values. In such cases, the MRF is misleading because it forces the belief probabilities toward configurations of state variables that are discordant from the true state, decreasing the probability of the true state. Thus, the proposed algorithm adapts (i.e., changes) the MRF potentials when the agent acquires knowledge about the true state-variable values and detects a mismatch between the information in the learned MRF and the specific state-variable relationships of the episode to fix the mismatch. The adaptation is performed online, as POMCP works, limiting the performance decrease that could derive from the usage of the MRF when the true state-variable configuration represents an unlikely state.</p>
<p>Our empirical analysis shows that the MRF adaptation method improves the performance obtained using the MRF without adaptation. We tested the algorithm on two domains, namely, rocksample (<xref ref-type="bibr" rid="B43">Smith and Simmons, 2004</xref>), a benchmark domain in which an agent moving in a grid has to collect hidden rocks maximizing their values, and velocity regulation (<xref ref-type="bibr" rid="B11">Castellini et al., 2020</xref>, <xref ref-type="bibr" rid="B10">2021</xref>), a domain in which a robot traveling on a predefined path has to regulate its velocity to minimize the time to reach the end and the collisions with obstacles in the path. Results show an average improvement of a discounted reward of 6.54% on rocksample and 3.51% on velocity regulation. Finally, we tested the proposed ROS-based architecture on a Gazebo simulator of rocksample. The architecture enables the generation of informative MRFs that produce statistically significant performance improvements. A video showing the evolution of the learning process performed on the ROS-based architecture and the Gazebo simulator is available in the <xref ref-type="sec" rid="s13">Supplementary Material</xref>. The code of the ROS-based architecture is also available online.<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>
</p>
<p>In summary, the main contributions of this work to the state of the art are as follows:<list list-type="simple">
<list-item>
<p>&#x2022; We present a methodology for learning state-variable relationships in the form of an MRF as POMCP is executed on a mobile robot.</p>
</list-item>
<list-item>
<p>&#x2022; We introduce a framework to integrate POMCP within ROS, targeting ROS-based mobile robots. The architecture supports both the phase in which the MRF is learned and the phase in which it is used.</p>
</list-item>
<list-item>
<p>&#x2022; We propose an algorithm for adapting the MRF constraints to episodes having unlikely state-variable configurations as new observations are acquired from the environment.</p>
</list-item>
</list>
</p>
<p>The rest of the study is organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> discusses related work. <xref ref-type="sec" rid="s3">Section 3</xref> describes the rocksample domain used as a running example. <xref ref-type="sec" rid="s4">Section 4</xref> presents background on POMDP, POMCP, MRF, and the extended POMCP. <xref ref-type="sec" rid="s5">Section 5</xref> formalizes the learning algorithm and the stopping criterion, describes the ROS-based architecture, and formalizes the MRF adaptation method. <xref ref-type="sec" rid="s6">Section 6</xref> presents the empirical evaluation of the three contributions. <xref ref-type="sec" rid="s7">Section 7</xref> draws conclusions and suggests future research directions.</p>
</sec>
<sec id="s2">
<title>2 Related Work</title>
<p>We identified four research topics in the literature related to our work: probabilistic planning under uncertainty, application of POMCP to robotic platforms, Bayesian adaptive learning and other forms of learning for planning, and MRF learning.</p>
<p>Planning under uncertainty is a crucial task for autonomous and intelligent agents. The first works on POMDP-based planning date back to the seventies (<xref ref-type="bibr" rid="B44">Sondik, 1978</xref>). Since then, several methods have been proposed to solve POMDPs (<xref ref-type="bibr" rid="B19">Kaelbling et al., 1998</xref>). Recent works highlight the benefits of introducing prior knowledge in problems formalized as POMDPs and solved by POMCP. <xref ref-type="bibr" rid="B9">Castellini et al. (2019)</xref> showed that the introduction of prior knowledge about state-variable relationships yields performance improvement. In particular, constraints expressed as MRFs (<xref ref-type="bibr" rid="B29">Murphy, 2012</xref>) and CNs (<xref ref-type="bibr" rid="B12">Dechter, 2003</xref>) were used. <xref ref-type="bibr" rid="B10">Castellini et al. (2021)</xref> showed how mobile robots exploited prior knowledge about task similarities to improve their navigation performance in an obstacle avoidance context. The main limitation of these works regards the requirement to have a full specification of the prior knowledge in advance, but this is not always feasible in practice, especially in complex application domains such as robotic ones. What differentiates our work from <xref ref-type="bibr" rid="B10">Castellini et al. (2021</xref>, <xref ref-type="bibr" rid="B9">2019)</xref> is that here we aim to learn the MRF on real robots while acting in the environment and adapt the MRF while it is used. Some other works deal with the problem of adding constraints to planning for improving the performance or scaling to large environments. <xref ref-type="bibr" rid="B25">Lee et al. (2018)</xref> used MCTS to generate policies for constrained POMDPs, and <xref ref-type="bibr" rid="B2">Amato and Oliehoek (2015)</xref> explored the multi-agent structure of some specific problems to decompose the value function. 
Instead, we constrain the state space on the basis of state-variable relationships to refine the belief during execution. More precisely, we exploit the learned MRF whose potentials express probabilistic constraints between state-variable values. Other related works in the field of planning under uncertainty concern factored POMDPs and their applications (<xref ref-type="bibr" rid="B28">McAllester and Singh, 1999</xref>; <xref ref-type="bibr" rid="B60">Williams and Young, 2007</xref>). However, our approach is substantially different as the performance improvement does not derive from a factorization of the POMDP but from the introduction in POMDP of prior knowledge on the domain, represented as an MRF learned from previously collected data.</p>
<p>Regarding the application of POMCP to robotic platforms, we noticed that the planning algorithm has been recently applied to different robotic problems. <xref ref-type="bibr" rid="B17">Goldhoorn et al. (2014)</xref> proposed two extensions of POMCP to find-and-follow people that work in the continuous space and plan actions in real time. The Adaptive Highest Belief Continuous Real-Time POMCP Follower presented in that paper aimed to avoid unnecessary turns of the robot in reaching the goal. Our method and ROS-based architecture, instead, aim to learn state-variable relationships and use them in POMCP to improve planning performance. <xref ref-type="bibr" rid="B58">Wang et al. (2020)</xref> and <xref ref-type="bibr" rid="B16">Giuliari et al. (2021)</xref> used POMCP in the context of Active Visual Search. The authors proposed a method in which the agent starts acting in an unknown environment (i.e., with no information about the area map). Moreover, they present a new belief reinvigoration approach dealing with dynamically growing state space. <xref ref-type="bibr" rid="B24">Lauri and Ritala (2016)</xref> used POMCP to control a mobile robot to explore a partially known environment. POMCP was previously integrated with ROS in <xref ref-type="bibr" rid="B59">Wertheim et al. (2020)</xref>, where a robotic planning platform called ROS-POMDP was presented. It generated the POMDP model of the problem using Performance Level Profiles (PLP) (<xref ref-type="bibr" rid="B7">Brafman et al., 2016</xref>) and Relational Dynamic Influence Diagram Language (RDDL) (<xref ref-type="bibr" rid="B38">Sanner, 2010</xref>). A two-layer control architecture was instead proposed by <xref ref-type="bibr" rid="B10">Castellini et al. (2021)</xref>, where the upper layer used an extension of POMCP to tune the velocity of a mobile robot and the lower layer used a standard engine controller to deal with path planning. 
As explained above, our proposed ROS architecture has a completely different goal, integrating the learning of the MRF with POMCP.</p>
<p>As our goal is to learn some information about the environment and introduce it in POMCP to improve its performance, we also analyzed related works on merging learning and planning, with a specific focus on POMDPs and POMCP. Our work is also related, for instance, to Bayesian adaptive learning in POMDPs (<xref ref-type="bibr" rid="B34">Ross et al., 2011</xref>). <xref ref-type="bibr" rid="B21">Katt et al. (2017)</xref> presented an elegant method for learning the transition and reward models. They extended the POMCP algorithm to the Bayes-Adaptive case, proposing the Bayes-Adaptive Partially Observable Monte Carlo Planning (BA-POMCP) approach that, however, learns the parameters of the transition model. Our method, instead, learns probabilities of pairs of state variables to have equal values in the hidden part of single states (i.e., we do not consider any information about how the state changes over time). We assume that the hidden part of the state can change only from one episode to another, and each state has a probability of occurring that depends on some (unknown) state-variable probabilistic relationships. We notice that this setting is very common in practice (see the warehouse example in the introduction), but it cannot be naturally encoded in the transition model. The information encoded in our MRF is instead used to initialize and update the belief. For the same reason, our approach also differentiates from Factored BA-POMDP (<xref ref-type="bibr" rid="B20">Katt et al., 2019</xref>), which learns a compact model of the dynamics by exploiting the underlying structure of a POMDP, allowing for better scale to large problems. Even this approach deals with knowledge about the transition from one state to another across the steps of execution, and it cannot learn the probability distribution of states considering probabilistic state-variable relationships, as our MRF does. We remark that we do not factorize the POMDP to learn the compact model of dynamics. 
We are interested in learning probabilistic relationships between state-variable values, which is information affecting the belief and its update over time. For instance, the traffic level in two aisles of a warehouse can be highly correlated. Hence, in an episode, the two aisles may have a high traffic level; in another episode, they may have a low traffic level, but the probability that the two aisles have different traffic levels in an episode is low. This prior knowledge about the state of the environment, represented by the initial belief in POMDPs, can be naturally integrated into POMCP using the MRF, a generative model that directly represents state-variable relationships. Using the MRF, we push the belief probabilities toward states that agree with this knowledge. Methodologies for optimally updating POMDP beliefs to reduce uncertainty on the true state have been proposed by <xref ref-type="bibr" rid="B48">Stachniss et al. (2005)</xref>, <xref ref-type="bibr" rid="B3">Araya et al. (2010)</xref>, <xref ref-type="bibr" rid="B55">Veiga (2015)</xref>, <xref ref-type="bibr" rid="B30">Ognibene et al. (2019)</xref>, <xref ref-type="bibr" rid="B14">Fischer and Tas (2020)</xref>, and <xref ref-type="bibr" rid="B52">Thomas et al. (2020)</xref>. However, these methods mainly focus on introducing the belief into the reward function to allow the definition of information gain goals, otherwise not definable, in the context of POMDP. In order to deal with large environments in practical problems, hierarchical models (<xref ref-type="bibr" rid="B15">Friston, 2008</xref>) have been used to extend the POMDP framework (<xref ref-type="bibr" rid="B32">Pineau et al., 2001</xref>; <xref ref-type="bibr" rid="B51">Theocharous et al., 2001</xref>; <xref ref-type="bibr" rid="B50">Theocharous et al., 2004</xref>; <xref ref-type="bibr" rid="B47">Sridharan et al., 2008</xref>; <xref ref-type="bibr" rid="B13">Doshi-Velez, 2009</xref>). 
These approaches take advantage of the structure of the problem to decompose the state or the action space, introducing different levels of abstraction to learn much larger models. Moreover, in these works, the computation of optimal policies is performed considering only a subset of the models or an action subset because it is intractable to compute optimal policies for the original problem. However, in our approach, we do not decompose the original problem into sub-tasks. We compute policies considering the entire problem domain. Finally, within the research topic of learning for planning in robotic platforms, <xref ref-type="bibr" rid="B4">Atrash and Pineau (2010)</xref> proposed a methodology for learning a model of the user in applications where untrained humans interact and control the robot. In this case, the goal is also to learn a model of the environment.</p>
<p>In the literature, some works proposed approaches to learning arbitrary MRF structures (<xref ref-type="bibr" rid="B5">Besag, 1977</xref>; <xref ref-type="bibr" rid="B1">Abbeel et al., 2006</xref>; <xref ref-type="bibr" rid="B33">Pletscher et al., 2009</xref>; <xref ref-type="bibr" rid="B37">Salakhutdinov, 2009</xref>; <xref ref-type="bibr" rid="B57">Vuffray et al., 2020</xref>) mainly in the field of computer vision. Due to their generality, these approaches have a higher complexity than our proposed approach, which is specialized in pairwise MRF for representing state-variable relationships inside POMDPs. <xref ref-type="bibr" rid="B39">Shah et al. (2021)</xref> also focused on pairwise MRF, but their proposed methodology focused on learning continuous pairwise MRF. The MRFs that we used in our approach are discrete.</p>
</sec>
<sec id="s3">
<title>3 Rocksample: A Domain for a Running Example</title>
<p>As a running example for explaining the main elements of the proposed contributions, in the rest of the study, we consider rocksample (<xref ref-type="bibr" rid="B43">Smith and Simmons, 2004</xref>), a benchmark domain inspired by robotic planetary exploration. In the rocksample, the agent acts in a grid containing valuable and valueless rocks and aims to maximize the value of the collected rocks. The agent does not know the values of the rocks, but it knows only their locations in the grid. Rock values can only be inferred from noisy observations returning the true value of the rock with a probability proportional to the distance between the agent and the observed rock. Knowing in advance the relationships between pairs of rock values (e.g., close rocks could have similar values in real-world applications), the agent can improve its planning performance, collecting more valuable rocks in less time. An example of a probabilistic equality relationship between two state variables <italic>X</italic>
<sub>1</sub> and <italic>X</italic>
<sub>2</sub> assuming values in {0, 1} is &#x201c;<italic>X</italic>
<sub>1</sub> is equal to <italic>X</italic>
<sub>2</sub> with probability 0.9&#x201d;. In the rocksample, this means that rocks 1 and 2 have the same values with high probability. This kind of relationship cannot be encoded in the transition or observation models because it does not deal with the dynamics of the environment or state observability. Instead, it is a property of state distribution and can be represented by the potential of a pairwise MRF in which nodes correspond to state variables and edges to probabilistic relationships between pairs of state-variable values.</p>
</sec>
<sec id="s4">
<title>4 Background</title>
<p>In this section, we provide definitions of POMDP, the model used to formalize our planning problem, POMCP, the planning algorithm used to solve the POMDP, MRF, and the structure used to represent state-variable relationships. Finally, we describe the extension of POMCP that considers prior information.</p>
<sec id="s4-1">
<title>4.1 POMDP</title>
<p>A POMDP (<xref ref-type="bibr" rid="B19">Kaelbling et al., 1998</xref>) is defined as a tuple (<italic>S</italic>, <italic>A</italic>, <italic>O</italic>, <italic>T</italic>, &#x3a9;, <italic>R</italic>, <italic>&#x3b3;</italic>), where <italic>S</italic> is a finite set of <italic>states</italic>, <italic>A</italic> is a finite set of <italic>actions</italic>, &#x3a9; is a finite set of <italic>observations</italic>, <italic>T</italic>: <italic>S</italic> &#xd7; <italic>A</italic> &#x2192; &#x3a0;(<italic>S</italic>) is the <italic>transition</italic> model, where &#x3a0;(<italic>S</italic>) is the space of probability distribution over states, <italic>O</italic>: <italic>S</italic> &#xd7; <italic>A</italic> &#x2192; &#x3a0;(&#x3a9;) is the <italic>observation model</italic>, <inline-formula id="inf1">
<mml:math id="m1">
<mml:mi>R</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:math>
</inline-formula> is the <italic>reward function</italic>, and <italic>&#x3b3;</italic> &#x2208; [0, 1) is the <italic>discount factor</italic>. The agent&#x2019;s goal, as in an MDP (<xref ref-type="bibr" rid="B36">Russell and Norvig, 2010</xref>), is to maximize the <italic>expected discounted return</italic> <inline-formula id="inf2">
<mml:math id="m2">
<mml:mi mathvariant="double-struck">E</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x221e;</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> acting optimally (i.e., choosing, in each state <italic>s</italic>
<sub>
<italic>t</italic>
</sub>, at time <italic>t</italic>, the action <italic>a</italic>
<sub>
<italic>t</italic>
</sub> with the highest expected reward). In the POMDP framework, however, the agent cannot directly observe the current state <italic>s</italic>
<sub>
<italic>t</italic>
</sub>, but it maintains a probability distribution over states <italic>S</italic>, called <italic>belief</italic>, which is updated at each time step. In the following, we represent by symbol <italic>b</italic>(<italic>s</italic>) the probability of being in state <italic>s</italic> according to belief <italic>b</italic>. The belief summarizes the agent&#x2019;s previous experiences, that is, the sequence of actions and observations that the agent took from an initial belief <italic>b</italic>
<sub>0</sub> to the belief <italic>b</italic>. The sequence of actions and observations is called history (<italic>h</italic>) and is represented as <italic>h</italic> &#x3d; &#x27e8;<italic>a</italic>
<sub>0</sub>, <italic>o</italic>
<sub>0</sub>, &#x2026; , <italic>a</italic>
<sub>
<italic>t</italic>
</sub>, <italic>o</italic>
<sub>
<italic>t</italic>
</sub>&#x27e9;. The solution of a POMDP is an optimal or approximated <italic>policy</italic>, namely, a function that maps belief states into actions, that is, <italic>&#x3c0;</italic>: <italic>B</italic> &#x2192; <italic>A</italic>, where <italic>B</italic> is the belief space. A policy is optimal if it maximizes the expected discounted return. The discount factor <italic>&#x3b3;</italic> guarantees convergence by reducing the weight of long-term rewards.</p>
</sec>
<sec id="s4-2">
<title>4.2 POMCP</title>
<p>POMCP (<xref ref-type="bibr" rid="B42">Silver and Veness, 2010</xref>) is a Monte-Carlo-based algorithm for planning in partially observable environments that combines MCTS (<xref ref-type="bibr" rid="B8">Browne et al., 2012</xref>) to compute an approximated policy with a <italic>particle filter</italic> to represent the belief. The particle filter is initialized with <italic>k</italic> particles, each representing a state <italic>s</italic> and following a uniform distribution if no prior knowledge is available about the initial state. At each step, POMCP uses an MCTS to find the best action to perform. The MCTS is generated by iteratively 1) sampling a state from the particle filter and 2) performing a simulation with that state according to the transition and observation models known by the agent. The Upper Confidence bounds applied to the Trees (UCT) strategy (<xref ref-type="bibr" rid="B22">Kocsis and Szepesv&#xe1;ri, 2006</xref>) is used to balance exploration and exploitation in the simulation phase. The reward of each simulation is backpropagated in the tree to compute the approximated Q-values <italic>Q</italic> (<italic>b</italic>, <italic>a</italic>) for the current belief <italic>b</italic> and, at the end of the process, the action <italic>a</italic> with the highest Q-value is selected. Then, the selected action <italic>a</italic> is performed in the real environment, a real observation <italic>o</italic> is collected, and particles in the belief are updated by keeping only particles that explain the observations. Particle reinvigoration is used if no more particles are available in the particle filter.</p>
</sec>
<sec id="s4-3">
<title>4.3 Markov Random Fields</title>
<p>An MRF is an undirected graph where nodes represent variables and edges represent probabilistic relationships between variable values (<xref ref-type="bibr" rid="B6">Bishop, 2006</xref>; <xref ref-type="bibr" rid="B29">Murphy, 2012</xref>). A potential function is a non-negative function of its arguments representing the relative &#x201c;compatibility&#x201d; of different variable assignments. According to the Hammersley&#x2013;Clifford theorem (<xref ref-type="bibr" rid="B54">Upton and Cook, 2008</xref>), the joint probability represented by the MRF can be computed as the product of potential functions over the maximal cliques of the graph, namely,<disp-formula id="e1">
<mml:math id="m3">
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>Z</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:munder>
<mml:mrow>
<mml:mo>&#x220f;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>where <bold>
<italic>x</italic>
</bold> is a variable configuration (e.g., <bold>
<italic>x</italic>
</bold> &#x3d; (1, 0 &#x2026; , 0)), <bold>
<italic>&#x3b8;</italic>
</bold> is a parametrization of the MRF (i.e., a specific set of values for the parameters <italic>&#x3b8;</italic> that represent the MRF), <italic>C</italic> is the set of maximal cliques, <italic>&#x3c8;</italic>
<sub>
<italic>c</italic>
</sub> (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>c</italic>
</sub>&#x7c;<bold>
<italic>&#x3b8;</italic>
</bold>
<sub>
<italic>c</italic>
</sub>) is the potential function, and <italic>Z</italic>(<bold>
<italic>&#x3b8;</italic>
</bold>) is the partition function, that is, a normalization factor that can be computed as<disp-formula id="e2">
<mml:math id="m4">
<mml:mi>Z</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
</mml:munder>
<mml:munder>
<mml:mrow>
<mml:mo>&#x220f;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Potentials can be represented by a Boltzmann distribution (i.e., exponentials); thus, <italic>&#x3c8;</italic>
<sub>
<italic>c</italic>
</sub> (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>c</italic>
</sub>&#x7c;<bold>
<italic>&#x3b8;</italic>
</bold>
<sub>
<italic>c</italic>
</sub>) &#x3d; exp (&#x2212;<italic>F</italic> (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>c</italic>
</sub>&#x7c;<bold>
<italic>&#x3b8;</italic>
</bold>
<sub>
<italic>c</italic>
</sub>)), where <italic>F</italic> (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>c</italic>
</sub>) is the energy function. Restricting the parametrization of the MRF to the edge rather than to the maximal clique of the graph, we obtain <italic>pairwise MRF</italic>, and, consequently, the product of potentials can be computed by summing the energies of all pairwise relationships. We call <italic>E</italic> the set of pairwise relationships (<italic>i</italic>, <italic>j</italic>) in the MRF, where <italic>i</italic>, <italic>j</italic> &#x2208; 1, <italic>&#x2026;</italic> , <italic>n</italic>, and <italic>n</italic> is the number of state variables. For instance, given a pair of state variables (<italic>X</italic>
<sub>
<italic>i</italic>
</sub>, <italic>X</italic>
<sub>
<italic>j</italic>
</sub>)&#x7c;(<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic> representing two rocks in rocksample, a potential could be <inline-formula id="inf3">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.45</mml:mn>
</mml:math>
</inline-formula>, which indicates a compatibility of 0.45 to have value 0 in both rocks <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub>, or <inline-formula id="inf4">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.05</mml:mn>
</mml:math>
</inline-formula>, which indicates a compatibility of 0.05 to have value 0 in rock <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and 1 in rock <italic>X</italic>
<sub>
<italic>j</italic>
</sub>. In the following, when we refer to an MRF we mean a set of potentials representing compatibilities of different variable assignments:<disp-formula id="e3">
<mml:math id="m7">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="0.28em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>E</mml:mi>
<mml:mo>,</mml:mo>
<mml:mspace width="0.28em"/>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>where <italic>k</italic> is the number of possible values of each variable.</p>
</sec>
<sec id="s4-4">
<title>4.4 Extended POMCP</title>
<p>The methodology we use to introduce prior knowledge in POMCP (<xref ref-type="bibr" rid="B9">Castellini et al., 2019</xref>) allows for defining probabilistic equality relationships among pairs of state variables through MRFs. The use of the MRF allows factorizing the joint probability function of state-variable configurations, and this probability is used to constrain the state space. Indeed, the MRF defines a probability distribution over states of the POMDP. For instance, in the rocksample domain, the state space is the set of all possible rock value configurations, and the constraints introduced by the MRF allow (probabilistically) reducing the possibility of exploring states that have a small probability of being the true state. The integration of prior knowledge in POMCP is mainly developed in the particle filter initialization and in the reinvigoration phase (<xref ref-type="bibr" rid="B9">Castellini et al., 2019</xref>), where the probabilistic constraints stored in the MRF are used to optimize the management of the particle filter representing the agent belief.</p>
<p>To intuitively understand the advantage introduced by the MRF, consider the rocksample environment depicted in <xref ref-type="fig" rid="F1">Figure 1A</xref>, in which the knowledge introduced by the MRF is represented by blue edges between rocks on the grid. The prior knowledge about state-variable relationships is information about equality relationships among the values of different rocks (e.g., with a probability of 0.9, rocks 4 and 5 have the same value). We use this knowledge to &#x201c;push&#x201d; the belief probabilities toward states that agree with this information during particle filter initialization. The rationale is that if the agent knows that two rocks (i.e., two state variables) have the same value with high probability (0.9 in <xref ref-type="fig" rid="F1">Figure 1A</xref>), then it can improve its planning performance because once it has observed the value of one rock, it has also acquired some knowledge about the value of the other rock and it can plan accordingly. In the first row of <xref ref-type="fig" rid="F1">Figure 1B</xref>, we show a hypothetical sequence of actions performed by the agent with no knowledge about rock value relationships (i.e., standard POMCP), whereas in the second row, we show a hypothetical sequence of actions performed by exploiting such knowledge. In both cases, in step 2, the agent performs a sensing action to check the value of rock 1 (yellow colored). In the hypothesis that the agent observes that rock 1 is valuable (green pentagon in the second column), in the first case (i.e., without MRF), it has no information about rocks 2 and 3, whereas in the second case (i.e., with MRF), the agent also has some information about rocks 2 and 3, which are considered valuable with high probability (green pentagons). 
In step 10, exploiting the acquired knowledge about rock value relationships, the agent with MRF has already sampled all three rocks, whereas the agent without any knowledge has only sampled rocks 1 and 2. Hence, the agent with the MRF moves faster. We remark that the knowledge in the MRF does not affect the transition model but only the probability distribution over POMDP states. The knowledge stored in an MRF is used to initialize the particle filter (representing the belief) of POMCP and update the particle filter (i.e., the belief) during reinvigoration, a procedure used by POMCP to introduce new particles upon depletion.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>
<bold>(A)</bold> Example of usage of the MRF in the rocksample environment. The MRF topology is depicted using rocks as nodes, and equality probability constraints are specified on blue edges between rocks. <bold>(B)</bold> Effect of the action performed by the agent at steps 2, 5, and 10 using standard POMCP (first row) and the information stored in the MRF (second row). A yellow-colored rock means that the rock is checked by the agent, whereas a green-colored rock represents a rock observed to be valuable by the agent. We consider rocks 1, 2, and 3 valuable in this example.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g001.tif"/>
</fig>
</sec>
</sec>
<sec id="s5">
<title>5 Methodology</title>
<p>In this section, we present a method for learning the MRF during POMCP execution (<xref ref-type="sec" rid="s5-1">Section 5.1</xref>) that leverages information from the state with the highest probability in the belief and a stopping criterion based on the convergence of MRF potentials (<xref ref-type="sec" rid="s5-2">Section 5.2</xref>). In <xref ref-type="sec" rid="s5-4">Section 5.4</xref>, we describe <italic>MRF Adaptation</italic>, the algorithm that adapts the learned MRF when new knowledge gathered about the true state-variable configuration differs from the information in the MRF. Finally, in <xref ref-type="sec" rid="s5-3">Section 5.3</xref>, we present the ROS-based architecture designed to have POMCP running within ROS and learn the MRF on real mobile robots.</p>
<sec id="s5-1">
<title>5.1 MRF Learning</title>
<p>We present a method to learn the MRF during POMCP execution based on the information provided by the belief. More precisely, it employs information from the state having maximum probability. In all our tests, we assume that the MRF is learned in <italic>NE</italic> episodes, where each episode <italic>e</italic> is composed of a fixed number of steps, but the proposed methodology can easily be adapted to the case of episodes with a different number of steps. We assume the hidden part of the state to be static in each episode and changing across episodes. We initialize the MRF with uninformative priors and then update it at the end of each episode. Then, at the end of the learning process, we have an MRF which defines probabilistic constraints on hidden state variables. This information allows for better initializing and updating of the state distribution. Details about the proposed methodology and the used data structures are reported in the following.</p>
<sec id="s5-1-1">
<title>5.1.1 Data Structures Used in the Learning Algorithm</title>
<p>Learning the MRF means learning the potentials of pairwise MRF representing state-variable relationships. Given two variables <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub> with <italic>k</italic> possible values each, we need to learn the potential <inline-formula id="inf5">
<mml:math id="m8">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> for each pair (<italic>l</italic>, <italic>h</italic>) with <italic>l</italic> &#x2208; {1, <italic>&#x2026;</italic> , <italic>k</italic>} and <italic>h</italic> &#x2208; {1, <italic>&#x2026;</italic> , <italic>k</italic>}, where variable equality occurs when <italic>l</italic> &#x3d; <italic>h</italic> and variable inequality occurs in all other cases. To keep track of state-variable values in different episodes, we use three data structures. First, a <italic>vector of state-variable values</italic> <inline-formula id="inf6">
<mml:math id="m9">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> for each episode <italic>e</italic>. <inline-formula id="inf7">
<mml:math id="m10">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the value of the state-variable <italic>X</italic>
<sub>
<italic>i</italic>
</sub> extracted from the state with maximum likelihood in the final belief of episode <italic>e</italic> (<italic>i</italic> &#x3d; 1, <italic>&#x2026;</italic> , <italic>n</italic> where <italic>n</italic> is the number of state variables). This vector is initialized to <inline-formula id="inf8">
<mml:math id="m11">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and then each value <inline-formula id="inf9">
<mml:math id="m12">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is updated to the value in {1, &#x2026; , <italic>k</italic>} obtained for variable <italic>X</italic>
<sub>
<italic>i</italic>
</sub> in episode <italic>e</italic>. The second data structure is a four-dimensional array in which we store the <italic>count of equalities and inequalities</italic> among pairs of state variables in each episode <italic>e</italic>, <inline-formula id="inf10">
<mml:math id="m13">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math></inline-formula>, where (<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic> and <italic>l</italic>, <italic>h</italic> &#x2208; {1, <italic>&#x2026;</italic> , <italic>k</italic>}. The value <inline-formula id="inf11">
<mml:math id="m14">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the number of times variable <italic>X</italic>
<sub>
<italic>i</italic>
</sub> had value <italic>l</italic> and variable <italic>X</italic>
<sub>
<italic>j</italic>
</sub> had value <italic>h</italic> in the previous <italic>e</italic> episodes, where <inline-formula id="inf12">
<mml:math id="m15">
<mml:mi>e</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">N</mml:mi>
</mml:math>
</inline-formula>. We update <inline-formula id="inf13">
<mml:math id="m16">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> at the end of each episode <italic>e</italic> using the values in <inline-formula id="inf14">
<mml:math id="m17">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and the MRF potentials <inline-formula id="inf15">
<mml:math id="m18">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> are directly computed using values in <inline-formula id="inf16">
<mml:math id="m19">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="disp-formula" rid="e5">Eq. 5</xref>). Hence, the MRF can be updated using values in <inline-formula id="inf17">
<mml:math id="m20">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. The third data structure is a matrix of <italic>probabilities of state-variable equalities</italic>, <inline-formula id="inf18">
<mml:math id="m21">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, where (<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic>. The value <inline-formula id="inf19">
<mml:math id="m22">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the probability that state variables <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub> had equal values until episode <italic>e</italic> (<xref ref-type="disp-formula" rid="e6">Eq. 6</xref>). Notice that the proposed learning algorithm learns both equalities and inequalities&#x2019; probabilistic relationships. Equalities are represented by edges with positive probabilities (e.g., <inline-formula id="inf20">
<mml:math id="m23">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:math>
</inline-formula> means that rocks <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub> have a 0.9 probability to have equal values), whereas inequalities are represented by edges with negative probabilities (e.g., <inline-formula id="inf21">
<mml:math id="m24">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:math>
</inline-formula> means that rocks <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub> have only a 0.1 probability of having equal values, that is, they have a probability of 0.9 of having different values).</p>
<p>In summary, at each episode <italic>e</italic>, we compute <inline-formula id="inf22">
<mml:math id="m25">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> from <inline-formula id="inf23">
<mml:math id="m26">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <italic>&#x3c8;</italic> from <inline-formula id="inf24">
<mml:math id="m27">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, and finally <inline-formula id="inf25">
<mml:math id="m28">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> from <italic>&#x3c8;</italic> following the pipeline <inline-formula id="inf26">
<mml:math id="m29">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. In the next section, we present the proposed learning algorithm and the related strategy for populating <inline-formula id="inf27">
<mml:math id="m30">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> and update <inline-formula id="inf28">
<mml:math id="m31">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Learning Algorithm</title>
<p>At each episode <italic>e</italic>, the vector of state-variable values <inline-formula id="inf29">
<mml:math id="m32">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is first populated with the values of state variables <italic>X</italic>
<sub>
<italic>i</italic>
</sub> of the state having maximum likelihood in the agent belief.</p>
<p>
<bold>Update of equality/inequality counts <inline-formula id="inf30">
<mml:math id="m33">
<mml:mi mathvariant="script">M</mml:mi>
</mml:math>
</inline-formula>.</bold> The array of equality/inequality counts <inline-formula id="inf31">
<mml:math id="m34">
<mml:mi mathvariant="script">M</mml:mi>
</mml:math>
</inline-formula> is initialized to <inline-formula id="inf32">
<mml:math id="m35">
<mml:mi mathvariant="script">M</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula>, <italic>&#x2200;</italic>(<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic>, <italic>&#x2200;l</italic>, <italic>h</italic> &#x2208; {1, &#x2026;, <italic>k</italic>}. At the end of each episode, the array <inline-formula id="inf33">
<mml:math id="m36">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> is updated using vector <inline-formula id="inf34">
<mml:math id="m37">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> as<disp-formula id="e4">
<mml:math id="m38">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>&#x2009;if&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2227;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>h</mml:mi>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>&#x2009;otherwise</mml:mtext>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>
<bold>Computation of potentials <italic>&#x3c8;</italic> from counts <inline-formula id="inf35">
<mml:math id="m39">
<mml:mi mathvariant="script">M</mml:mi>
</mml:math>
</inline-formula>.</bold> We compute MRF potentials <italic>&#x3c8;</italic> from multi-dimensional array <inline-formula id="inf36">
<mml:math id="m40">
<mml:mi mathvariant="script">M</mml:mi>
</mml:math>
</inline-formula> at each episode <italic>e</italic> by normalizing each cell using the following formula:<disp-formula id="e5">
<mml:math id="m41">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>w</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>where (<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic>. Namely, we consider only pairs of nodes connected by an edge. For instance, given a pair of state variables (<italic>X</italic>
<sub>
<italic>i</italic>
</sub>, <italic>X</italic>
<sub>
<italic>j</italic>
</sub>)&#x7c;(<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic> assuming values in {0, 1}, the potential <inline-formula id="inf37">
<mml:math id="m42">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.6</mml:mn>
</mml:math>
</inline-formula> corresponds to the ratio between the number of times <italic>X</italic>
<sub>
<italic>i</italic>
</sub> &#x3d; <italic>X</italic>
<sub>
<italic>j</italic>
</sub> &#x3d; 0 and the number of times each possible assignment for <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub> has been observed. Namely, given <inline-formula id="inf38">
<mml:math id="m43">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>6</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf39">
<mml:math id="m44">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf40">
<mml:math id="m45">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula>, and <inline-formula id="inf41">
<mml:math id="m46">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:math>
</inline-formula>, we compute <inline-formula id="inf42">
<mml:math id="m47">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>6</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>6</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.6</mml:mn>
</mml:math>
</inline-formula>.</p>
<p>
<bold>Computation of probabilities of state-variable equalities <inline-formula id="inf43">
<mml:math id="m48">
<mml:mi mathvariant="script">P</mml:mi>
</mml:math>
</inline-formula> from <italic>&#x3c8;</italic>.</bold> These probabilities are finally computed for each (<italic>i</italic>, <italic>j</italic>) &#x2208; <italic>E</italic>:<disp-formula id="e6">
<mml:math id="m49">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>In other words, <inline-formula id="inf44">
<mml:math id="m50">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the sum of potentials corresponding to equal values of variables <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub>. For instance, given the pair of state variables (<italic>X</italic>
<sub>
<italic>i</italic>
</sub>, <italic>X</italic>
<sub>
<italic>j</italic>
</sub>) and the potentials <inline-formula id="inf45">
<mml:math id="m51">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.6</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf46">
<mml:math id="m52">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf47">
<mml:math id="m53">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.1</mml:mn>
</mml:math>
</inline-formula>, and <inline-formula id="inf48">
<mml:math id="m54">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c8;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.2</mml:mn>
</mml:math>
</inline-formula>, we compute <inline-formula id="inf49">
<mml:math id="m55">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.8</mml:mn>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 Stopping Criterion</title>
<p>At the end of each learning episode, the MRF is updated, considering the information about the state-variable relationships acquired in the episode. The question we answer in this section is, &#x201c;when can the learning process be stopped?&#x201d;. The MRF must provide meaningful knowledge about state-variable relationships to improve planning performance. The methodology we propose analyzes the equality probabilities in the MRF and stops the learning phase when these probabilities converge, namely, when their values change very little over a few consecutive episodes. More precisely, at the end of each episode <italic>e</italic>, we check if each equality probability <inline-formula id="inf50">
<mml:math id="m56">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>E</mml:mi>
</mml:math>
</inline-formula>, differs by less than a threshold <italic>&#x3b7;</italic> from the same equality probability at the end of the previous episode <italic>e</italic> &#x2212; 1. If this condition is satisfied for <italic>ce</italic> consecutive episodes, then we stop the MRF learning process. <xref ref-type="statement" rid="algorithm_1">Algorithm 1</xref> formalizes the approach. It receives the matrices of equality probabilities at episodes <italic>e</italic> and <italic>e</italic> &#x2212; 1, namely, <inline-formula id="inf51">
<mml:math id="m57">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf52">
<mml:math id="m58">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, the convergence threshold <italic>&#x3b7;</italic>, the threshold <italic>ce</italic> on the number of consecutive episodes, and the number <italic>ct</italic> of consecutive episodes that satisfied the condition on the convergence threshold until the current episode <italic>e</italic>. It returns the stop learning flag <italic>stop</italic> and the updated number of consecutive episodes that satisfy the convergence condition <inline-formula id="inf53">
<mml:math id="m59">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. The value of <italic>stop</italic> is true if, for every edge, the difference between the value at episode <italic>e</italic> and episode <italic>e</italic> &#x2212; 1 is below the threshold <italic>&#x3b7;</italic> (line 3) for at least <italic>ce</italic> consecutive episodes (line 9), false otherwise. The value of <inline-formula id="inf54">
<mml:math id="m60">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is used for checking the stopping condition at the next episode.</p>
<p>
<statement content-type="algorithm" id="algorithm_1">
<label>Algorithm 1</label>
<p>Stopping criterion.</p>
<p>
<inline-graphic xlink:href="frobt-09-819107-fx1.tif"/>
</p>
</statement>
</p>
</sec>
<sec id="s5-3">
<title>5.3 ROS Architecture for POMCP</title>
<p>We developed a light and straightforward framework that integrates POMCP with ROS, targeting mobile robots. The architecture can also be exploited to execute the MRF learning algorithm and subsequently run the extended POMCP that leverages the constraints in the learned MRF. Optionally, the extended POMCP can be run with MRF adaptation. The architecture can be used with all mobile robotic platforms supporting the ROS Navigation Stack (<xref ref-type="bibr" rid="B27">Marder-Eppstein et al., 2010</xref>) and with POMDPs defined following the original POMCP implementation. Additionally, it can be executed in simulation using Gazebo. Since the architecture relies on the ROS network to communicate, the POMCP algorithm is not directly run on the machine mounted on the robotic platform but on an external one, which has more computational power. This results in faster execution of POMCP, with lower power consumption for the mobile robot. The structure of the architecture is illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. It contains three main components, namely, the <italic>environment</italic>, the <italic>planner</italic>, and the <italic>agent</italic>, all implemented in C&#x2b;&#x2b;. In the following paragraphs, each component is described in detail.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>ROS architecture for running POMCP on mobile robotic platforms. The three main components are identified by the colored boxes and connected to the same ROS network. The planner supports standard POMCP, extended POMCP, and our proposed approach with MRF adaptation, besides the MRF learning algorithm.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g002.tif"/>
</fig>
<sec id="s5-3-1">
<title>5.3.1 Environment</title>
<p>The environment is a discretization of the real world that exploits a task-specific representation, such as a grid for the rocksample domain.</p>
</sec>
<sec id="s5-3-2">
<title>5.3.2 Planner</title>
<p>The role of the planner is manifold. First, it runs POMCP, from the standard to the extended version. Second, it manages the whole learning process, handling the learning algorithm and keeping track of learned relations to eventually trigger the stopping criterion. When performing the MRF learning process, the node keeps track of the belief, after which each action is performed by the agent. Then, at the end of each episode, the MRF is updated accordingly.</p>
<p>The planner communicates with the ROS network during the <italic>Step</italic> function call, hence when applying the transition and the observation model of the POMDP. Right after producing the best-desired action, a command is dispatched to the agent, and the planner pauses until the agent feeds back the result.</p>
</sec>
<sec id="s5-3-3">
<title>5.3.3 Agent</title>
<p>The agent node is the interface with the robotic platform. It holds information about the robot&#x2019;s position through odometry and is responsible for moving the mobile platform to the desired position whenever the planner produces a goal command, which corresponds to a 3D pose in the environment. This is done by exploiting the ROS Navigation Stack, which takes the pose as input and gives a series of target velocities as output. On the contrary, if the planner produces a sensing action, the agent will directly interact with the environment or sensors mounted on the robotic platform.</p>
</sec>
</sec>
<sec id="s5-4">
<title>5.4 MRF Adaptation</title>
<p>The MRF is learned on several episodes and contains probabilistic information about state-variable relationships. For instance, a probability of 0.9 between state variables <italic>X</italic>
<sub>1</sub> and <italic>X</italic>
<sub>2</sub>, that is, <inline-formula id="inf55">
<mml:math id="m61">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:math>
</inline-formula>, in the rocksample domain means that, in 90% of the learning episodes, the most probable state-variable configuration had equal values in rocks <italic>X</italic>
<sub>1</sub> and <italic>X</italic>
<sub>2</sub>. When the MRF is used in a new episode, however, the values of the rocks in that specific episode can be equal to or different from each other (e.g., in a specific episode <italic>X</italic>
<sub>1</sub> could be valuable and <italic>X</italic>
<sub>2</sub> valueless, although this configuration has only probability 0.1 to occur). The MRF is used in POMCP to &#x201c;push&#x201d; the belief probabilities toward states that agree with the joint probability it represents. In other words, using the constraints among state-variable values introduced by the MRF, we probabilistically reduce the possibility of having in the particle filter a large number of particles corresponding to states with a small probability of being the true state. At each episode, we initialize the belief leveraging the information present in the MRF, peaking the probability distribution on states that reflect the equality relationships expressed in the MRF. In our example, the states with the same value of <italic>X</italic>
<sub>1</sub> and <italic>X</italic>
<sub>2</sub> will be initialized with a higher probability. This is beneficial if the values in the true state of the current episode are actually equal (which happens with a probability of 0.9 in our example) and harmful if the values in the true state of the current episode are actually different from each other (which happens with a probability of 0.1). In this second case, the belief is peaked in the wrong states and even several observations could be not enough to &#x201c;correct&#x201d; the probability distribution over states, leading to a performance decrease with respect to the standard POMCP. Thus, the idea of the algorithm presented in this section is to adapt the probabilities in the MRF during the usage of the MRF as new evidence is gathered about the true values of the state variables in the specific episode and there is a mismatch (i.e., <italic>discrepancy</italic>) between these true values and the information in the MRF. Then, the adapted MRF is used to re-initialize the belief to change the agent strategy. Let us consider, for instance, an episode of rocksample in which rock <italic>X</italic>
<sub>1</sub> is valuable and rock <italic>X</italic>
<sub>2</sub> is valueless. When we use the MRF, the states with different rock values are penalized, but if the agent collects the rocks, then their true values are available. Hence, we can detect the discrepancy between the MRF probabilities <inline-formula id="inf56">
<mml:math id="m62">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:math>
</inline-formula> and the true rock values and update the MRF probabilities accordingly to avoid penalizing good states in the following steps of the same episode. This is the idea behind the MRF adaptation algorithm formalized in <xref ref-type="statement" rid="algorithm_2">Algorithm 2</xref> and explained in detail in the following: notice that, given an episode <italic>e</italic>, the adaptation of the MRF has effect only in the steps after a discrepancy is detected in that episode. However, the learned MRF is restored in the next episode <italic>e</italic> &#x2b; 1 because each episode is characterized by a different true state. We remark that the proposed algorithm does not learn a new MRF, as it adapts the information stored in the learned MRF when a discrepancy is detected during an episode, and it uses the adapted MRF to re-initialize the particle filter. At the beginning of the subsequent episode, we restore the learned MRF (with no adaptation) and use it to initialize the particle filter. On the contrary, during the learning process, we update the MRF leveraging the information given by the state with the highest probability in the belief, and we do not introduce the MRF in POMCP.</p>
<p>
<statement content-type="algorithm" id="algorithm_2">
<label>Algorithm 2</label>
<p>MRF adaptation algorithm.</p>
<p>
<inline-graphic xlink:href="frobt-09-819107-fx2.tif"/>
</p>
<p>The inputs of the main function of the algorithm (Function Adapt) are as follows: the step <italic>q</italic> of the episode, the MRF <inline-formula id="inf57">
<mml:math id="m63">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>n</mml:mi>
</mml:math>
</inline-formula> updated until the current step of the current episode; the index <italic>i</italic> of the state-variable of which we have observed the true value in the current step; the vector <italic>TV</italic> [<italic>i</italic>] of state variables observed until the current step, where <italic>TV</italic> [<italic>i</italic>] &#x3d; <italic>na</italic> if the true value of the variable has not been observed and <italic>TV</italic> [<italic>i</italic>] &#x3d; <italic>v</italic>
<sub>
<italic>i</italic>
</sub> if the true value of the variable has been observed; and &#x27e8;<italic>a</italic>
<sub>0</sub>, <italic>o</italic>
<sub>0</sub>, &#x2026; , <italic>a</italic>
<sub>
<italic>q</italic>
</sub>, <italic>o</italic>
<sub>
<italic>q</italic>
</sub>&#x27e9; the sequence of actions and observations (history) obtained up to the current execution step <italic>q</italic>. The output is the adapted MRF and the new belief <italic>b</italic>&#x2032; (returned by Function Belief_recomputation) if a discrepancy has been detected. Otherwise, the Function Adapt ends returning the received MRF and an empty belief to notify that no discrepancies have been detected.</p>
<p>Every time the true value of a state-variable <italic>X</italic>
<sub>
<italic>i</italic>
</sub> is gathered, the algorithm checks if <italic>X</italic>
<sub>
<italic>i</italic>
</sub> is connected to another state variable <italic>X</italic>
<sub>
<italic>j</italic>
</sub> by an edge in the MRF and if both variables have been observed (line 6). In this case, the algorithm checks the value of <inline-formula id="inf58">
<mml:math id="m64">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and the values of the observed variables <italic>v</italic>
<sub>
<italic>i</italic>
</sub> and <italic>v</italic>
<sub>
<italic>j</italic>
</sub> to detect a discrepancy (line 7). In particular, a discrepancy occurs if the equality probability in the MRF is discordant with the true values of <italic>X</italic>
<sub>
<italic>i</italic>
</sub> and <italic>X</italic>
<sub>
<italic>j</italic>
</sub>. In such a case, the MRF must be updated. If <inline-formula id="inf59">
<mml:math id="m65">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:math>
</inline-formula> and <italic>v</italic>
<sub>
<italic>i</italic>
</sub> &#x2260; <italic>v</italic>
<sub>
<italic>j</italic>
</sub>, then <inline-formula id="inf60">
<mml:math id="m66">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is set to 0 (see line 10). This is because we are sure that the two variables have different values in the current episode, and the MRF is updated accordingly. If <inline-formula id="inf61">
<mml:math id="m67">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:math>
</inline-formula> and <italic>v</italic>
<sub>
<italic>i</italic>
</sub> &#x3d; <italic>v</italic>
<sub>
<italic>j</italic>
</sub>, then the algorithm sets <inline-formula id="inf62">
<mml:math id="m68">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> to 1 (see line 12). In this case, we are sure that the two variables have the same values, and the MRF is again updated accordingly. In the example above, if rock <italic>x</italic>
<sub>1</sub> is valuable and rock <italic>X</italic>
<sub>2</sub> is valueless but <inline-formula id="inf63">
<mml:math id="m69">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.9</mml:mn>
</mml:math>
</inline-formula>, then this probability is set to <inline-formula id="inf64">
<mml:math id="m70">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula> in the adapted MRF.</p>
<p>If a discrepancy has been detected and the MRF updated, the current belief at step <italic>q</italic> must also be updated considering the new specific knowledge acquired on the current episode. Function <italic>Belief_recomputation</italic> performs this task. Its inputs are the step <italic>q</italic> of the episode, the adapted MRF <inline-formula id="inf65">
<mml:math id="m71">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and the history of actions and observations &#x27e8;<italic>a</italic>
<sub>0</sub>, <italic>o</italic>
<sub>0</sub>, &#x2026; , <italic>a</italic>
<sub>
<italic>q</italic>
</sub>, <italic>o</italic>
<sub>
<italic>q</italic>
</sub>&#x27e9;, and its output is the updated belief <italic>b</italic>. The new belief <italic>b</italic> is first initialized (line 21), sampling <italic>NP</italic> states according to the distribution defined by the adapted MRF (we set <italic>NP</italic> to the number of POMCP simulations), and then updated using POMCP belief update following the current history &#x27e8;<italic>a</italic>
<sub>1</sub>, <italic>o</italic>
<sub>1</sub>, &#x2026; , <italic>a</italic>
<sub>
<italic>q</italic>
</sub>, <italic>o</italic>
<sub>
<italic>q</italic>
</sub>&#x27e9; (see line 25). <xref ref-type="bibr" rid="B42">Silver and Veness (2010)</xref> used a simulator <inline-formula id="inf66">
<mml:math id="m72">
<mml:mi mathvariant="script">G</mml:mi>
</mml:math>
</inline-formula> as a generative model of the POMDP. The updated belief <italic>b</italic> is used in the next step instead of the current belief.</p>
</statement>
</p>
</sec>
</sec>
<sec id="s6">
<title>6 Experiments</title>
<p>In this section, we present the results of our empirical analysis. We perform three different tests on two application domains described in <xref ref-type="sec" rid="s6-1">Section 6.1</xref>. <xref ref-type="sec" rid="s6-2">Section 6.2</xref> defines the measure used to evaluate the performance. Then, we present the results of our test following the order by which we introduced the methodological contributions. First, in <xref ref-type="sec" rid="s6-3">Section 6.3</xref>, we analyze the performance of the proposed learning algorithm on a <italic>C&#x2b;&#x2b;</italic> simulator of the rocksample environment. The empirical analysis shows the average performance improvement achieved using the learned MRF in the extended POMCP against standard POMCP (in the following, we refer to them as <italic>EXT</italic> and <italic>STD</italic>, respectively). Second, in <xref ref-type="sec" rid="s6-4">Section 6.4</xref>, we show the evaluation of the ROS-based architecture for learning and using the learned MRF. The empirical analysis shows the average performance improvement achieved when the MRF learned on the robotic platform is used in EXT on the same platform. A video is also presented, which shows a complete learning process performed on the Gazebo simulator of rocksample. Third, in <xref ref-type="sec" rid="s6-5">Section 6.5</xref>, we describe the experiments performed to evaluate the MRF adaptation algorithm. The performance of POMCP with MRF adaptation (<italic>ADA</italic>, in the following) is compared with that of EXT.</p>
<sec id="s6-1">
<title>6.1 Domains</title>
<p>We provide full details on the two application domains used in our tests, namely, rocksample (<xref ref-type="bibr" rid="B43">Smith and Simmons, 2004</xref>) and velocity regulation (<xref ref-type="bibr" rid="B11">Castellini et al., 2020</xref>, <xref ref-type="bibr" rid="B10">2021</xref>).</p>
<sec id="s6-1-1">
<title>6.1.1 Rocksample</title>
<p>In the rocksample domain (<xref ref-type="bibr" rid="B43">Smith and Simmons, 2004</xref>), an agent moves through a grid containing valuable and valueless rocks placed in a fixed position to maximize the discounted reward collecting rock values. We perform our tests on rocksample (5,8), consisting of a 5 &#xd7; 5 grid in which we place eight rocks (<xref ref-type="fig" rid="F3">Figure 3A</xref>). The rock value configuration changes at each episode and is decided <italic>a priori</italic> to reflect specific constraints. Notation (<italic>i</italic>, <italic>j</italic>) identifies the cell in column <italic>i</italic> and row <italic>j</italic> on the grid, whereas for rocks, we use indices from 1 to 8. The agent (light blue circle in <xref ref-type="fig" rid="F3">Figure 3A</xref>) knows the rock locations, but it cannot observe rock values (which is the hidden part of the state). These values can only be inferred using observations returned by the environment. The accuracy of rock observations, however, is inversely proportional to the distance between the agent position and the rock. At each step, the agent performs one action among <italic>moving</italic> (up, down, left, right), <italic>sensing</italic> a rock (i.e., checking its value), or <italic>sampling</italic> a rock (i.e., collecting its value). The reward obtained by moving and sensing is 0, whereas sampling a rock gives a reward of 10 if the rock is valuable and &#x2212;10 if it is valueless. <xref ref-type="fig" rid="F3">Figure 3B</xref> shows the true MRF we used to constrain rock values. It presents five edges with the following probability values: <inline-formula id="inf67">
<mml:math id="m73">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.90</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf68">
<mml:math id="m74">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2,3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf69">
<mml:math id="m75">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.92</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf70">
<mml:math id="m76">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>4,5</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>, and <inline-formula id="inf71">
<mml:math id="m77">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>5,6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>. Thus, admissible configurations of rock values have, with high probability, all these rocks with the same value, whereas the values of rocks 7 and 8 can be randomly assigned because there are no constraints on their values in the MRF.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A)</bold> Instance of rocksample environment (<xref ref-type="bibr" rid="B61">Zuccotto et al., 2022</xref>). <bold>(B)</bold> True MRF topology used for the domain.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g003.tif"/>
</fig>
<p>This problem can be formalized as a POMDP. The <italic>state</italic> is characterized by 1) the agent position on the grid, 2) the rocks&#x2019; configuration (hidden), and 3) a flag indicating rocks already sampled. The set of <italic>actions</italic> is composed of the four moving actions, the sample action, and a sensing action for each rock. <italic>Observations</italic> have three possible values: 1 for valuable and 2 for valueless rock observation returned by sensing actions and 3 for null observations returned by moving actions. The discount factor used is <italic>&#x3b3;</italic> &#x3d; 0.95. We aim to maximize the information learned about state-variable relationships, so we prevent the agent from exiting the grid.</p>
</sec>
<sec id="s6-1-2">
<title>6.1.2 Velocity Regulation</title>
<p>In the velocity regulation problem (<xref ref-type="bibr" rid="B11">Castellini et al., 2020</xref>, <xref ref-type="bibr" rid="B10">2021</xref>), a mobile robot traverses a pre-defined path (<xref ref-type="fig" rid="F4">Figure 4A</xref>) divided into segments <italic>g</italic>
<sub>
<italic>i</italic>
</sub> and subsegments <italic>g</italic>
<sub>
<italic>i</italic>,<italic>j</italic>
</sub>. Notation (<italic>i</italic>, <italic>j</italic>) identifies the position of the robot in the path, where <italic>i</italic> is the index of the segment and <italic>j</italic> the index of the subsegment. More precisely, with (<italic>i</italic>, <italic>j</italic>), we mean that the agent is at the beginning of subsegment <italic>g</italic>
<sub>
<italic>i</italic>,<italic>j</italic>
</sub>. Each segment is characterized by a difficulty <italic>f</italic>
<sub>
<italic>i</italic>
</sub> that depends on the obstacle density in the segment. The robot has to traverse the entire path in the shortest possible time, tuning its speed <italic>v</italic> to avoid collisions with obstacles. Each time the robot collides, a time penalty is given. The robot does not know in advance the real difficulty of the segments (which is the hidden part of the state), and it can only infer their values from the readings of a sensor (<xref ref-type="fig" rid="F4">Figure 4A</xref>). <xref ref-type="fig" rid="F4">Figure 4B</xref> shows the true MRF that we used to constrain segment difficulties. It presents five edges with the following probability values: <inline-formula id="inf72">
<mml:math id="m78">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.90</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf73">
<mml:math id="m79">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2,3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf74">
<mml:math id="m80">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.92</mml:mn>
</mml:math>
</inline-formula>, <inline-formula id="inf75">
<mml:math id="m81">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>4,5</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>, and <inline-formula id="inf76">
<mml:math id="m82">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>5,6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.91</mml:mn>
</mml:math>
</inline-formula>. Thus, admissible configurations of segment difficulties have, with high probability, all these segments with the same value, whereas the values of segments 7 and 8 can be randomly assigned as there are no constraints on their values in the MRF.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold> Instance of velocity regulation. <bold>(B)</bold> True MRF topology used for the domain.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g004.tif"/>
</fig>
<p>This problem can be formalized as a POMDP. The <italic>state</italic> is characterized by 1) the position of the robot in the path; 2) the (hidden) true configuration of segment difficulties (<italic>f</italic>
<sub>1</sub>, <italic>&#x2026;</italic> , <italic>f</italic>
<sub>
<italic>m</italic>
</sub>), where <italic>f</italic>
<sub>
<italic>j</italic>
</sub> &#x2208; {<italic>L</italic>, <italic>M</italic>, <italic>H</italic>}, <italic>L</italic> represents low difficulty, <italic>M</italic> medium difficulty, <italic>H</italic> high difficulty; 3) <italic>t</italic> is the time elapsed from the beginning of the path. The set of <italic>actions</italic> is composed of the three possible speed values of the robot in a subsegment: slow (<italic>S</italic>), intermediate (<italic>I</italic>), or fast (<italic>F</italic>). <italic>Observations</italic> are related to subsegment occupancy and robot angular velocity. The <italic>occupancy model</italic> <italic>p</italic> (<italic>oc</italic>&#x7c;<italic>f</italic>) probabilistically relates segment difficulties to subsegment occupancy. <italic>oc</italic> &#x3d; 0 means that no obstacles are detected in the next subsegment. On the contrary, <italic>oc</italic> &#x3d; 1 means that some obstacles are detected. The <italic>angular velocity model</italic>, instead, provides the probability of angular velocity given segment difficulties, namely, <italic>p</italic> (<italic>av</italic>&#x7c;<italic>f</italic>). More precisely, <italic>av</italic> &#x3d; 0 means that the robot performs a few curves in the subsegment, whereas <italic>av</italic> &#x3d; 1 means it performs several curves. In a realistic application on a mobile robot, <italic>oc</italic> is computed by averaging the values of the laser in front of the robot and applying a threshold to obtain the two binary values. Moreover, we count the actions corresponding to robot turns with angular velocity <inline-formula id="inf77">
<mml:math id="m83">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>45</mml:mn>
</mml:math>
</inline-formula>&#xb0;/s, and threshold such count to obtain the binary signal for <italic>av</italic>. The final observation is a coding of both variables <italic>oc</italic> and <italic>av</italic> computed as <italic>o</italic> &#x3d; <italic>av</italic> &#x2b; 2 &#x22c5; <italic>oc</italic>. Namely, <italic>o</italic> &#x3d; 0 if <italic>av</italic> &#x3d; 0 and <italic>oc</italic> &#x3d; 0; <italic>o</italic> &#x3d; 1 if <italic>av</italic> &#x3d; 1 and <italic>oc</italic> &#x3d; 0; <italic>o</italic> &#x3d; 2 if <italic>av</italic> &#x3d; 0 and <italic>oc</italic> &#x3d; 1; and <italic>o</italic> &#x3d; 3 if <italic>av</italic> &#x3d; 1 and <italic>oc</italic> &#x3d; 1. The observation model provides the probability of observations given segment difficulties, namely, <italic>p</italic> (<italic>o</italic>&#x7c;<italic>f</italic>). We refer to the original work on the velocity regulation problem for more details about specific parameters (<xref ref-type="bibr" rid="B10">Castellini et al., 2021</xref>).</p>
<p>The time required to traverse a subsegment depends on the action that the agent performs and the time penalty it receives. Namely, the agent needs one time unit if the action is <italic>F</italic> (fast speed), two time units if the action is <italic>I</italic>, and three time units if the action is <italic>S</italic>. The <italic>collision model</italic> <italic>p</italic> (<italic>c</italic>&#x7c;<italic>f</italic>, <italic>a</italic>) regulates the collision probability; more precisely, <italic>c</italic> &#x3d; 0 means no collision and <italic>c</italic> &#x3d; 1 means a collision occurs. The reward function here is <italic>R</italic> &#x3d; &#x2212;(<italic>t</italic>
<sub>1</sub> &#x2b; <italic>t</italic>
<sub>2</sub>), where <italic>t</italic>
<sub>1</sub> is the time depending on the agent&#x2019;s action and <italic>t</italic>
<sub>2</sub> is the penalty due to collisions (in our tests <italic>t</italic>
<sub>2</sub> &#x3d; 10). Finally, the discount factor we used is <italic>&#x3b3;</italic> &#x3d; 0.95. The parameters used in our tests are summarized in <xref ref-type="table" rid="T1">Tables 1</xref>&#x2013;<xref ref-type="table" rid="T3">3</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Main elements of the POMDP model for the collision avoidance problem. Occupancy model <italic>p</italic> (<italic>oc</italic>&#x7c;<italic>f</italic>): probability of subsegment occupancy given segment difficulty.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">
<italic>f</italic>
</th>
<th align="center">
<italic>p</italic> (<italic>oc</italic> &#x3d; 1 &#x7c; <italic>f</italic>)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">L</td>
<td align="char" char=".">0.600</td>
</tr>
<tr>
<td align="left">M</td>
<td align="char" char=".">0.690</td>
</tr>
<tr>
<td align="left">H</td>
<td align="char" char=".">0.940</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Main elements of the POMDP model for the collision avoidance problem. Angular velocity model <italic>p</italic> (<italic>av</italic>&#x7c;<italic>f</italic>).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">
<italic>f</italic>
</th>
<th align="center">
<italic>p</italic> (<italic>av</italic> &#x3d; 1 &#x7c; <italic>f</italic>)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">L</td>
<td align="char" char=".">0.170</td>
</tr>
<tr>
<td align="left">M</td>
<td align="char" char=".">0.240</td>
</tr>
<tr>
<td align="left">H</td>
<td align="char" char=".">0.530</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Main elements of the POMDP model for the collision avoidance problem. Collision model <italic>p</italic> (<italic>c</italic>&#x7c;<italic>f</italic>, <italic>a</italic>): collision probability given segment difficulty and action.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">
<italic>f</italic>
</th>
<th align="center">
<italic>a</italic>
</th>
<th align="center">
<italic>p</italic> (<italic>c</italic> &#x3d; 1 &#x7c; <italic>f</italic>, <italic>a</italic>)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">L</td>
<td align="center">S</td>
<td align="char" char=".">0.000</td>
</tr>
<tr>
<td align="left">L</td>
<td align="center">I</td>
<td align="char" char=".">0.033</td>
</tr>
<tr>
<td align="left">L</td>
<td align="center">F</td>
<td align="char" char=".">0.033</td>
</tr>
<tr>
<td align="left">M</td>
<td align="center">S</td>
<td align="char" char=".">0.000</td>
</tr>
<tr>
<td align="left">M</td>
<td align="center">I</td>
<td align="char" char=".">0.033</td>
</tr>
<tr>
<td align="left">M</td>
<td align="center">F</td>
<td align="char" char=".">0.067</td>
</tr>
<tr>
<td align="left">H</td>
<td align="center">S</td>
<td align="char" char=".">0.000</td>
</tr>
<tr>
<td align="left">H</td>
<td align="center">I</td>
<td align="char" char=".">0.067</td>
</tr>
<tr>
<td align="left">H</td>
<td align="center">F</td>
<td align="char" char=".">0.100</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s6-2">
<title>6.2 Performance Measure</title>
<p>We introduce the performance measure used to evaluate the planning performance of our methods: difference and average difference in discounted returns. The discounted return of episode <italic>e</italic>, called <italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub>, is the sum of the discounted rewards collected in all steps of that episode. The difference between the discounted return obtained using two different methods, such as EXT with the learned MRF and STD or ADA and EXT, on episode <italic>e</italic> is called &#x394;<italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub>. The average of this difference over all episodes of all runs is called <inline-formula id="inf78">
<mml:math id="m84">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. Notice that the difference is computed episode by episode to reduce the randomness, and the average is computed across all the episodes of each run. Indeed, the discounted return depends on the state of the episode; then, it could have very different values over the episodes and the distribution of these data would be very large. By computing the mean of the difference on each episode, we always compare the performance of the two algorithms in the same state, thus obtaining a low standard deviation value as a result of the reduced level of uncertainty.</p>
</sec>
<sec id="s6-3">
<title>6.3 Test on MRF Learning</title>
<p>We introduce the experimental setting used in our tests on the MRF learning method and then present our empirical analysis results.</p>
<sec id="s6-3-1">
<title>6.3.1 Experimental Setting</title>
<p>We perform tests using the MRF learning algorithm (<xref ref-type="sec" rid="s5-1">Section 5.1</xref>) with the stopping criterion (<xref ref-type="sec" rid="s5-2">Section 5.2</xref>) to learn the MRF. Experiments are performed on the rocksample domain described in <xref ref-type="sec" rid="s6-1-1">Section 6.1.1</xref> using a <italic>C&#x2b;&#x2b;</italic> simulator.</p>
<p>In this test, we first select a true MRF (i.e., a set of relationships among rock values; see <xref ref-type="fig" rid="F5">Figure 5A</xref>). Edge probabilities are always set to 0.9 in the true MRF. We perform <italic>NR &#x3d; 10</italic> runs. Hence, we compute 10 MRFs. In each run, we start preparing an empty MRF with the same topology (i.e., set of edges) as the true one (notice that our current method does not learn the topology of the MRF but only the potentials of an MRF with pre-defined topology). We learn the MRF potentials for several episodes determined by the stopping criterion with threshold <italic>&#x3b7;</italic> &#x3d; 0.01 and <italic>ce</italic> &#x3d; 3. The configuration of rock values changes with each episode satisfying the distribution defined by the true MRF. Then, we evaluate the performance of the learned MRF performing <italic>NE &#x3d; 100</italic> episodes with EXT and STD algorithms, comparing the discounted return of each episode and averaging it over all the runs. In each episode, the agent performs <italic>NS &#x3d; 60</italic> steps. The POMCP always uses 100,000 particles and performs the same number of simulations.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>
<bold>(A)</bold> True MRF topology with the equality probability constraints on its edges. <bold>(B)</bold> True MRF and average of the learned MRFs. Pink dots represent the values on the edges of the true MRF, whereas blue dots and lines correspond to the average edge values of the learned MRF and their standard deviations, respectively. <bold>(C)</bold> Difference of edge probability values during execution of the learning process until the convergence is reached in episode 27. The black line represents the convergence threshold. <bold>(D)</bold> The density of difference in discounted return from STD.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g005.tif"/>
</fig>
<p>To prove that the introduction of the learned MRF provides a statistically significant improvement with respect to STD, we show that the average difference <inline-formula id="inf79">
<mml:math id="m85">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> between the discounted return obtained with EXT and the discounted return obtained with STD is significantly larger than zero. Notice that the difference is computed across all the NE &#x3d; 100 episodes of each run (i.e., over 1,000 episodes in total). More precisely, at episode <italic>e</italic>, we compute the difference of discounted return <italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub> as <inline-formula id="inf80">
<mml:math id="m86">
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>X</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>. Then, we compute the average of these values over all the episodes of all the runs, obtaining the <italic>average discounted return</italic> <inline-formula id="inf81">
<mml:math id="m87">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s6-3-2">
<title>6.3.2 Results</title>
<p>The results we obtained using the C&#x2b;&#x2b; simulator are summarized in <xref ref-type="fig" rid="F5">Figure 5</xref>. The main result is represented by the average difference in discounted return, <inline-formula id="inf82">
<mml:math id="m88">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, achieved using the learned MRF in EXT with respect to STD that does not use any kind of prior knowledge. The value of <inline-formula id="inf83">
<mml:math id="m89">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is 1.15 and corresponds to a performance improvement of 5.99% (<xref ref-type="fig" rid="F5">Figure 5D</xref>). The distribution and corresponding average difference is computed over 100 episodes and 10 runs. To verify that <inline-formula id="inf84">
<mml:math id="m90">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is statistically different from zero, we perform the Student&#x2019;s <italic>t</italic>-test that confirms the statistical significance of the result as the <italic>p</italic>-value is lower than 0.05.</p>
<p>To explain the motivation for this improvement in <xref ref-type="fig" rid="F5">Figure 5B</xref>, we compare the true and the learned MRF. On the <italic>x</italic>-axis, we display the edges of the MRF topology, whereas, on the <italic>y</italic>-axis, we show edge probability values. With pink dots, we represent the values on the true MRF edges (i.e., that from which we sampled the state-variable configurations of the learning episodes), whereas blue dots and lines represent, respectively, the average values of the learned MRF and their standard deviations (where the average is computed over the 10 runs performed in the learning process). The picture shows that the similarity between learned MRFs and the true one is very high. Moreover, <xref ref-type="fig" rid="F5">Figure 5C</xref> depicts the trend of difference in probability values of all edges during a run of the learning process until it is stopped by the proposed criterion. In episode 25, on the <italic>x</italic>-axis, the difference in equality probabilities of all edges starts to be lower than 0.01, the threshold used in the stopping criterion. Since this condition persists in the next three episodes, the stopping criterion ends the learning phase in episode 27. Similar results with a different stopping criterion have been presented by <xref ref-type="bibr" rid="B61">Zuccotto et al. (2022</xref>).</p>
</sec>
</sec>
<sec id="s6-4">
<title>6.4 Tests on the ROS Architecture for MRF Learning</title>
<p>In this section, we test the ROS architecture for MRF learning and present the results of our empirical analysis performed using this architecture.</p>
<sec id="s6-4-1">
<title>6.4.1 Experimental Setting</title>
<p>We perform tests using the MRF learning algorithm (<xref ref-type="sec" rid="s5-1">Section 5.1</xref>) with the stopping criterion (<xref ref-type="sec" rid="s5-2">Section 5.2</xref>) to learn the MRF on the ROS architecture proposed in <xref ref-type="sec" rid="s5-3">Section 5.3</xref>. We perform our tests on the open-source multi-robot simulator Gazebo (<xref ref-type="bibr" rid="B23">Koenig and Howard, 2004</xref>), in which TurtleBot3 acts in the rocksample domain described in <xref ref-type="sec" rid="s6-1-1">Section 6.1.1</xref>.</p>
<p>In this test, we first select a true MRF (<xref ref-type="fig" rid="F6">Figure 6A</xref>). Edge probabilities are always set to the values on the edges of the true MRF topology. We perform <italic>NR &#x3d; 10</italic> runs. In each run, we start preparing an empty MRF with the same topology as the true one. We learn the MRF potentials on the Gazebo environment, running the learning algorithm for several episodes determined by the stopping criterion with threshold <italic>&#x3b7;</italic> &#x3d; 0.01 and <italic>ce</italic> &#x3d; 3. The configuration of rock values changes in each episode, satisfying the distribution defined by the true MRF shown in <xref ref-type="fig" rid="F6">Figure 6A</xref>. Then, we test the performance of the learned MRF performing <italic>NE &#x3d; 100</italic> episodes with EXT and STD, comparing the discounted return of each episode and averaging it over all the runs. The MRF we used is the average of the 10 MRFs obtained during the learning process. In each episode, the agent performs <italic>NS &#x3d; 60</italic> steps. The POMCP always uses <italic>NP &#x3d; 100,000</italic> particles and performs the same number of simulations.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>
<bold>(A)</bold> True MRF topology with the equality probability constraints on its edges. <bold>(B)</bold> True MRF and average of the learned MRFs. Pink dots represent the values on the edges of the true MRF, whereas blue dots and lines correspond to the average edge values of the learned MRF and their standard deviations, respectively. <bold>(C)</bold> Difference of edge probability values during execution of the learning process until the convergence is reached in episode 23. The black line represents the convergence threshold. <bold>(D)</bold> Density of difference in discounted return from STD.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g006.tif"/>
</fig>
<p>To prove that the introduction of the learned MRF provides a statistically significant improvement with respect to STD, we show that the average difference <inline-formula id="inf85">
<mml:math id="m91">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> between the discounted return obtained with the MRF learned with the ROS-architecture on Gazebo and the discounted return obtained with STD on the same framework is significantly larger than zero. Notice that the difference is computed episode by episode, and the average is computed across all the NE &#x3d; 100 episodes of each run (i.e., over 1,000 episodes in total). More precisely, at episode <italic>e</italic>, we compute the difference of discounted return <italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub> as <inline-formula id="inf86">
<mml:math id="m92">
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>X</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>. Then, we compute the average of these values over all the episodes of all the runs <italic>average discounted return</italic> <inline-formula id="inf87">
<mml:math id="m93">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s6-4-2">
<title>6.4.2 Results</title>
<p>The results we obtained using the Gazebo simulator are summarized in <xref ref-type="fig" rid="F6">Figure 6</xref>. The main result consists of the average difference of discounted return, <inline-formula id="inf88">
<mml:math id="m94">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, achieved using the learned MRF in EXT with respect to STD that does not use any kind of prior knowledge. The value of <inline-formula id="inf89">
<mml:math id="m95">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is 1.28 and corresponds to a performance improvement of 5.88% (<xref ref-type="fig" rid="F6">Figure 6D</xref>). The distribution and corresponding average difference are computed over 100 episodes and 10 runs. To verify that <inline-formula id="inf90">
<mml:math id="m96">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is statistically different from zero, we perform the Student&#x2019;s <italic>t</italic>-test that confirms the statistical significance of the result as the <italic>p</italic>-value is lower than 0.05.</p>
<p>What allows this improvement is visible in <xref ref-type="fig" rid="F6">Figure 6B</xref> in which we compare the true and the learned MRF. On the <italic>x</italic>-axis, we display the edges of the MRF topology, while on the <italic>y</italic>-axis, we show edge probability values. Pink dots represent the values on the true MRF edges (i.e., that from which we sampled the state-variable configurations of the learning episodes), whereas blue dots and lines represent, respectively, the average values of the learned MRF and their standard deviations (where the average is computed over the 10 runs performed in the learning process). The picture shows that the learned MRFs are very similar to the true one. Moreover, this also shows that using the proposed learning approach implemented in the ROS architecture allows us to learn accurate MRFs. <xref ref-type="fig" rid="F6">Figure 6C</xref> depicts the trend of difference in probability values of all edges during a run of the learning process until it is stopped by the proposed criterion. In episode 20, on the <italic>x</italic>-axis, the difference in equality probabilities of all edges starts to be lower than 0.01, the threshold used in the stopping criterion. Because this condition persists in the next three episodes, the stopping criterion ends the learning phase in episode 23.</p>
<p>To further clarify the learning process performed on the ROS-based architecture, we provide a video showing four learning episodes performed by a TurtleBot in the Gazebo simulator of the rocksample domain. <xref ref-type="fig" rid="F7">Figure 7</xref> shows a snapshot of the video. The mobile robot acting in the Gazebo environment is shown in <xref ref-type="fig" rid="F7">Figure 7A</xref>. When it performs a sensing action on a rock, a question mark appears in the cell containing the rock. This cell becomes green or red if the outcome of the sensing action identifies the rock as valuable or valueless. When the agent performs a sampling action on a rock, the cell in which the rock is placed turns blue to specify that the rock has been collected. <xref ref-type="fig" rid="F7">Figure 7B</xref> shows the true rock value configuration of the episode that satisfies the distribution defined by the true MRF. In <xref ref-type="fig" rid="F7">Figures 7C,D</xref>, we show the edge probability values of the learned MRF updated at the end of episode 23 and the ones of the true MRF we aim at learning. In <xref ref-type="fig" rid="F7">Figure 7E</xref>, we show the evolution of the edge probability values in the learned MRF; when all the values reach convergence, the learning process ends.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>
<bold>(A)</bold> Instance of the rocksample environment in which the TurtleBot acts during episode 23. <bold>(B)</bold> True rock values in episode 23. <bold>(C)</bold> MRF learned after 23 episodes. <bold>(D)</bold> True MRF, the one to be learned. <bold>(E)</bold> Evolution of edge probability values at the end of episode 23. When they converge, the learning process is ended.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g007.tif"/>
</fig>
</sec>
</sec>
<sec id="s6-5">
<title>6.5 Test on MRF Adaptation</title>
<p>We introduce the experimental setting used in our tests on ADA and then present the results of our empirical analysis.</p>
<sec id="s6-5-1">
<title>6.5.1 Experimental Setting</title>
<p>We perform two tests to evaluate the ADA method described in <xref ref-type="sec" rid="s5-4">Section 5.4</xref>: one is performed on rocksample (5,8) and the second on velocity regulation. In both cases, a C&#x2b;&#x2b; simulator of the environment has been used to avoid the slowdown introduced by Gazebo because the physics of the environment is not fundamental to evaluating this algorithm. The goal of our tests is to highlight that by using ADA, we can, on average, improve the performance of the planner over both STD and EXT. This improvement is achieved by limiting the performance decrease generated when the learned or expert-given MRF is used on episodes characterized by unlikely state-variable configurations. In both tests, we perform <italic>NR &#x3d; 10</italic> runs using an MRF that reflects probabilistic equality constraints among state-variable values learned using the MRF learning method of <xref ref-type="sec" rid="s5-1">Section 5.1</xref>. To evaluate the performance of ADA, we perform <italic>NE &#x3d; 100</italic> episodes using the MRF adaptation approach every time a discrepancy is detected during an episode and <italic>NE &#x3d; 100</italic> episodes using the EXT algorithm. Then, we compare the discounted return of the two methods considering only the episodes in which the MRF adaptation approach has been used and average it over all the runs. In each episode, the agent performs <italic>NS &#x3d; 60</italic> steps in the rocksample domain, whereas in the velocity regulation environment, it performs <italic>NS &#x3d; 32</italic> steps, namely, the number of subsegments in the path. The configuration of rock values (for the test on rocksample) and segment difficulties (for the test on velocity regulation) changes with each episode, satisfying the distribution defined by the true MRF. The POMCP always uses <italic>NP &#x3d; 100,000</italic> particles and performs the same number of simulations. 
We summarize the parameters used in our tests in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Parameters of tests on ADA.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Environment</th>
<th align="center">NR</th>
<th align="center">NE</th>
<th align="center">NS</th>
<th align="center">NP</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Rocksample</td>
<td align="char" char=".">10</td>
<td align="char" char=".">100</td>
<td align="char" char=".">60</td>
<td align="char" char=".">100,000</td>
</tr>
<tr>
<td align="left">Velocity regulation</td>
<td align="char" char=".">10</td>
<td align="char" char=".">100</td>
<td align="char" char=".">32</td>
<td align="char" char=".">100,000</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To prove that the use of the MRF adaptation method in POMCP provides a statistically significant improvement with respect to the use of the MRF without adaptation, we show that the average difference <inline-formula id="inf91">
<mml:math id="m97">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> between the discounted return obtained with ADA and the discounted return obtained with EXT is significantly larger than zero. Notice that the difference is computed episode by episode, and the average is computed across all the episodes of each run in which ADA is used. More precisely, at episode <italic>e</italic>, we compute the difference of discounted return as <inline-formula id="inf92">
<mml:math id="m98">
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mi>X</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>. Then, we compute the average of these values over all the episodes of all the runs <italic>average discounted return</italic> <inline-formula id="inf93">
<mml:math id="m99">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s6-5-2">
<title>6.5.2 Results</title>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> and <xref ref-type="table" rid="T5">Table 5</xref> summarize the results of the two environments.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Density of difference in discounted return between <bold>(A)</bold> ADA and EXT on rocksample. <bold>(B)</bold> ADA and STD on rocksample. <bold>(C)</bold> ADA and EXT on velocity regulation. <bold>(D)</bold> ADA and STD on velocity regulation.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g008.tif"/>
</fig>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Performance of ADA.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Environment</th>
<th align="center">Comparison</th>
<th align="center">
<inline-formula id="inf94">
<mml:math id="m100">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>%</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<italic>p</italic>-value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Rocksample</td>
<td align="center">ADA&#x2014;EXT</td>
<td align="center">1.35 (6.54%)</td>
<td align="center">5.68 &#xd7; 10<sup>&#x2013;5</sup>
</td>
</tr>
<tr>
<td align="left"/>
<td align="center">ADA&#x2014;STD</td>
<td align="center">1.62 (7.46%)</td>
<td align="center">4.37 &#xd7; 10<sup>&#x2013;22</sup>
</td>
</tr>
<tr>
<td align="left">Velocity regulation</td>
<td align="center">ADA&#x2014;EXT</td>
<td align="center">1.04 (3.51%)</td>
<td align="center">8.70 &#xd7; 10<sup>&#x2013;7</sup>
</td>
</tr>
<tr>
<td align="left"/>
<td align="center">ADA&#x2014;STD</td>
<td align="center">1.35 (3.34%)</td>
<td align="center">5.82 &#xd7; 10<sup>&#x2013;8</sup>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s6-5-2-1">
<title>6.5.2.1 Rocksample</title>
<p>
<xref ref-type="fig" rid="F8">Figure 8A</xref> shows the distribution of the differences between the discounted returns obtained using ADA and those obtained using EXT. The distribution is computed considering the 162 episodes (out of 1,000, i.e., 100 episodes for 10 runs) in which the adaptation mechanism was activated (i.e., at least one discrepancy between the learned MRF and the true state has been detected). The average difference is <inline-formula id="inf95">
<mml:math id="m101">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.35</mml:mn>
</mml:math>
</inline-formula>, which corresponds to a 6.54% improvement (see the first line of <xref ref-type="table" rid="T5">Table 5</xref>). The <italic>p</italic>-value of the Student&#x2019;s <italic>t</italic>-test guarantees that this average is significantly different from zero (<xref ref-type="table" rid="T5">Table 5</xref>). <xref ref-type="fig" rid="F8">Figure 8B</xref> shows the distribution of the differences between the discounted returns obtained using ADA and those obtained using STD. This distribution is computed on 1,000 values (i.e., 100 episodes for 10 runs). The average is <inline-formula id="inf96">
<mml:math id="m102">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.62</mml:mn>
</mml:math>
</inline-formula>, which corresponds to a 7.46% improvement (see the second line of <xref ref-type="table" rid="T5">Table 5</xref>). Also, in this case, the <italic>p</italic>-value of the Student&#x2019;s <italic>t</italic>-test guarantees that this average is significantly different from zero (<xref ref-type="table" rid="T5">Table 5</xref>). Therefore, we can state that the improvement is, on average, statistically significant.</p>
</sec>
<sec id="s6-5-2-2">
<title>6.5.2.2 Velocity Regulation</title>
<p>The experiments performed on the velocity regulation domain confirm the positive results obtained on the rocksample. <xref ref-type="fig" rid="F8">Figure 8C</xref> shows the distribution of the differences between the discounted returns obtained using ADA and the ones obtained using EXT. The distribution is computed considering 714 episodes (out of 1,000), that is, the number of episodes in which the adaptation mechanism was activated. The average difference is <inline-formula id="inf97">
<mml:math id="m103">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.04</mml:mn>
</mml:math>
</inline-formula>, corresponding to a 3.51% performance improvement (third line of <xref ref-type="table" rid="T5">Table 5</xref>). This average is significantly different from zero because the <italic>p</italic>-value of the Student&#x2019;s <italic>t</italic>-test is lower than 0.05 (<xref ref-type="table" rid="T5">Table 5</xref>). <xref ref-type="fig" rid="F8">Figure 8D</xref> shows the distribution of the differences between the discounted return obtained with ADA and the ones obtained with STD. In this case, the distribution is computed on 1,000 values (i.e., all the 100 episodes for all the 10 runs). The average is <inline-formula id="inf98">
<mml:math id="m104">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1.35</mml:mn>
</mml:math>
</inline-formula> and it corresponds to an improvement of 3.34% (fourth line of <xref ref-type="table" rid="T5">Table 5</xref>). Furthermore, in this case, the <italic>p</italic>-value of the Student&#x2019;s <italic>t</italic>-test guarantees that the value of <inline-formula id="inf99">
<mml:math id="m105">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is statistically different from zero (<xref ref-type="table" rid="T5">Table 5</xref>). Thus, the performance improvement obtained is, on average, statistically significant.</p>
<p>Finally, to highlight the different behavior of ADA compared to EXT, in <xref ref-type="fig" rid="F9">Figure 9A</xref>, we show the behavior of ADA (on the left) and EXT (on the right) in a specific episode in which ADA is used and gives a performance improvement. Instead, in <xref ref-type="fig" rid="F9">Figure 9D</xref>, we show the behavior of a specific episode in which the use of ADA yields a decrease in performance. In each figure, we represent on the left grid the actions performed by the agent using ADA, whereas, on the right grid, we represent its action using the learned MRF. To denote the presence of a rock in a specific cell, we use its ID (from 1 to 8). The agent&#x2019;s starting position is represented by the light blue circle, and blue arrows indicate the path traveled by the agent. In pink-bordered boxes, we indicate the ID of the rock that the agent senses from a cell. With green boxes and red triangles we, respectively, represent the fact that the agent samples a valuable or valueless rock in the corresponding cell. Finally, the orange lightning means that a discrepancy is detected and that the adaptation approach is used as previously described in <xref ref-type="sec" rid="s5-4">Section 5.4</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>
<bold>(A)</bold> Relevant actions of the execution traces of an episode with a <italic>positive</italic> &#x394;<italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub> between ADA (on the left) and EXT (on the right). <bold>(B)</bold> Initial MRFs (learnt). <bold>(C)</bold> Adapted MRF, <inline-formula id="inf100">
<mml:math id="m106">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula>. <bold>(D)</bold> Relevant actions of the execution traces of an episode with a <italic>negative</italic> &#x394;<italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub> between ADA (on the left) and the use of EXT (on the right). <bold>(E)</bold> Initial MRFs (learnt). <bold>(F)</bold> MRF after the first adaptation, <inline-formula id="inf101">
<mml:math id="m107">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>2,3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula>. <bold>(G)</bold> MRF after the second adaptation, <inline-formula id="inf102">
<mml:math id="m108">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>5,6</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:math>
</inline-formula>.</p>
</caption>
<graphic xlink:href="frobt-09-819107-g009.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F9">Figure 9A</xref> shows the lightning symbol in cell (2,2) of the left grid. The learned MRF (<xref ref-type="fig" rid="F9">Figure 9B</xref>) expresses a high equality probability between rock 4 and rock 3 <inline-formula id="inf103">
<mml:math id="m109">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.90</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Thus, after the valuable rock 4 has been collected, the agent is encouraged to also sample rock 3. In the true state-variable configuration, instead, rock 3 is valueless; thus, a discrepancy with the learned MRF is detected. Then, the probability value on the edge <inline-formula id="inf104">
<mml:math id="m110">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>3,4</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is set to 0 because the assignments of rock 3 and 4 are different (<xref ref-type="fig" rid="F9">Figure 9C</xref>). Afterward, the particle filter is re-initialized according to <xref ref-type="statement" rid="algorithm_2">Algorithm 2</xref> and the belief re-computed. The positive effect of the proposed method is clearly visible because the agent does not sample rocks 1 and 2. Rock 2, in fact, is related to (valueless) rock 3 by an equality probability of 0.89 (on average) in the learned MRF, so the agent is not encouraged to sample rock 2 (<xref ref-type="fig" rid="F9">Figure 9C</xref>). Rock 1, in turn, is related to rock 2 by <inline-formula id="inf105">
<mml:math id="m111">
<mml:mi mathvariant="script">P</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn>1,2</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.88</mml:mn>
</mml:math>
</inline-formula>; thus, the agent does not sample rock 1. For the same reason, rocks 5 and 6 are sampled due to the equality probability that relates the assignment of rock 5 to the valuable rock 4 and the one that relates the assignment of rock 5 to the value of rock 6 (both probabilities are 0.9 on average). On the right grid of <xref ref-type="fig" rid="F9">Figure 9A</xref>, instead, we see what happens when the learned MRF, despite its correctness in probabilistic terms, does not reflect the state-variable configuration in the specific episode at hand. The agent samples the valueless rock 3; then, because its knowledge about the environment does not change, the agent also samples rocks 1 and 2, both valueless. In this episode, ADA allows limiting the negative effect of a misleading MRF, obtaining a &#x394;<italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub> of 14.13.</p>
<p>In <xref ref-type="fig" rid="F9">Figure 9D</xref>, instead, we depict the most relevant agent actions of an episode in which ADA performs worse than EXT. On the left grid, we show that two discrepancies with the learned MRF (<xref ref-type="fig" rid="F9">Figure 9E</xref>) are detected, respectively, in cells (2,5) and (1,4). Thus, ADA is used twice in this episode. The effect of the first usage of ADA (<xref ref-type="fig" rid="F9">Figure 9F</xref>) consists of discouraging the agent from sampling rock 1, whereas the second (<xref ref-type="fig" rid="F9">Figure 9G</xref>) does not influence any other sampling action because rock 5 has already been sampled and no other state variable has equality relationships with rock 6. In the right grid, the agent performs a sensing action on rock 6 that returns a negative response, discouraging the agent from sampling the rock. The different behavior of the agents regarding rock 6 gives a negative value for &#x394;<italic>&#x3c1;</italic>
<sub>
<italic>e</italic>
</sub>, which is &#x2212;4.63.</p>
</sec>
</sec>
</sec>
</sec>
<sec id="s7">
<title>7 Conclusion and Future Work</title>
<p>We presented three main contributions to the literature: a methodology for learning state-variable relationships in POMCP in the form of an MRF, an algorithm for adapting the MRF to the true states encountered while using the MRF in POMCP, and a ROS-based architecture that allows running the MRF learning and the POMCP with the MRF on real robotic platforms. Results show that the MRF adaptation algorithm achieves a statistically significant performance improvement over the use of the MRF without adaptation. Moreover, using the proposed architecture, we managed to learn informative MRFs that yield statistically significant performance improvement over standard POMCP. Our future work will focus on two main directions. From a methodological point of view, an interesting problem concerns integrating the learning process into the context of information gain problems on POMDPs. The goal, in that case, is to tune the exploration-exploitation trade-off considering the learning of the MRF. From an application viewpoint, we aim to extend the proposed ROS architecture to support other kinds of platforms, such as robotic manipulators, to assess our method on different problems that can be formalized as POMDPs.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data Availability Statement</title>
<p>Code availability: <ext-link ext-link-type="uri" xlink:href="https://github.com/kriato/pomcp_mrf_ros">https://github.com/kriato/pomcp_mrf_ros</ext-link>.</p>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s13">Supplementary Material</xref>. Further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec id="s9">
<title>Author Contributions</title>
<p>MZ and AC: conceptualization, methodology, and writing&#x2014;review and editing. MP: software, visualization, and writing&#x2014;review and editing. EM: visualization and writing&#x2014;review and editing. AF: supervision, project administration, funding acquisition, conceptualization, and writing&#x2014;review and editing.</p>
</sec>
<sec id="s10">
<title>Funding</title>
<p>The research has been partially supported by the projects &#x201c;Dipartimenti di Eccellenza 2018-2022&#x201d;, funded by the Italian Ministry of Education, Universities and Research (MIUR), and &#x201c;SAFEPLACE, POR-FESR 2014-2020&#x201d;, funded by Regione del Veneto.</p>
</sec>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s13">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/frobt.2022.819107/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/frobt.2022.819107/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Video1.MP4" id="SM1" mimetype="video/mp4" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/kriato/pomcp_mrf_ros">https://github.com/kriato/pomcp_mrf_ros</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Koller</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ng</surname>
<given-names>A. Y.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Learning Factor Graphs in Polynomial Time and Sample Complexity</article-title>. <source>J. Mach. Learn. Res.</source> <volume>7</volume>, <fpage>1743</fpage>&#x2013;<lpage>1788</lpage>. </citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Amato</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Oliehoek</surname>
<given-names>F. A.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Scalable Planning and Learning for Multiagent POMDPs</article-title>,&#x201d; in <conf-name>Proceedings of the AAAI15</conf-name>, <fpage>1995</fpage>&#x2013;<lpage>2002</lpage>. </citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Araya</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Buffet</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Thomas</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Charpillet</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>A Pomdp Extension with Belief-dependent Rewards</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems</source> (<publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <volume>Vol. 23</volume>. </citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Atrash</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pineau</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>A Bayesian Method for Learning Pomdp Observation Parameters for Robot Interaction Management Systems</article-title>,&#x201d; in <conf-name>The POMDP Practitioners Workshop</conf-name>. </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Besag</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1977</year>). <article-title>Efficiency of Pseudolikelihood Estimation for Simple Gaussian Fields</article-title>. <source>Biometrika</source> <volume>64</volume>, <fpage>616</fpage>&#x2013;<lpage>618</lpage>. <pub-id pub-id-type="doi">10.1093/biomet/64.3.616</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bishop</surname>
<given-names>C. M.</given-names>
</name>
</person-group> (<year>2006</year>). <source>Pattern Recognition and Machine Learning (Information Science and Statistics)</source>. <publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer-Verlag</publisher-name>. </citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Brafman</surname>
<given-names>R. I.</given-names>
</name>
<name>
<surname>Bar-Sinai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ashkenazi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Performance Level Profiles: A Formal Language for Describing the Expected Performance of Functional Modules</article-title>,&#x201d; in <conf-name>2016 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-loc>Daejeon, South Korea</publisher-loc>: <publisher-name>IEEE Press</publisher-name>), <fpage>1751</fpage>&#x2013;<lpage>1756</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2016.7759280</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Browne</surname>
<given-names>C. B.</given-names>
</name>
<name>
<surname>Powley</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Whitehouse</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lucas</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Cowling</surname>
<given-names>P. I.</given-names>
</name>
<name>
<surname>Rohlfshagen</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2012</year>). <article-title>A Survey of Monte Carlo Tree Search Methods</article-title>. <source>IEEE Trans. Comput. Intell. AI Games</source> <volume>4</volume>, <fpage>1</fpage>&#x2013;<lpage>43</lpage>. <pub-id pub-id-type="doi">10.1109/TCIAIG.2012.2186810</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chalkiadakis</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Influence of State-Variable Constraints on Partially Observable Monte Carlo Planning</article-title>,&#x201d; in <conf-name>Proceedings of the Twenty-Eighth International Joint Conference on Artificial Intelligence, IJCAI 2019 (ijcai.org)</conf-name>, <fpage>5540</fpage>&#x2013;<lpage>5546</lpage>. <pub-id pub-id-type="doi">10.24963/ijcai.2019/769</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Marchesini</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Partially Observable Monte Carlo Planning with State Variable Constraints for Mobile Robot Navigation</article-title>. <source>Eng. Appl. Artif. Intell.</source> <volume>104</volume>, <fpage>104382</fpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2021.104382</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Marchesini</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Mazzi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Explaining the Influence of Prior Knowledge on POMCP Policies</article-title>,&#x201d; in <conf-name>Multi-Agent Systems and Agreement Technologies - 17th European Conference, EUMAS 2020, and 7th International Conference, AT 2020</conf-name>, <conf-loc>Thessaloniki, Greece</conf-loc>, <conf-date>September 14-15, 2020</conf-date>, <fpage>261</fpage>&#x2013;<lpage>276</lpage>. <comment>Revised Selected Papers (Springer), vol. 12520 of Lecture Notes in Computer Science</comment>. <pub-id pub-id-type="doi">10.1007/978-3-030-66412-1_17</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dechter</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2003</year>). <source>Constraint Processing</source>. <publisher-loc>Burlington, Massachusetts</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers Inc</publisher-name>. </citation>
</ref>
<ref id="B13">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Doshi-Velez</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>The Infinite Partially Observable Markov Decision Process</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems, NeurIPS 2009</source> (<publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>477</fpage>&#x2013;<lpage>485</lpage>. </citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fischer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tas</surname>
<given-names>O. S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Information Particle Filter Tree: An Online Algorithm for POMDPs with Belief-Based Rewards on Continuous Domains</article-title>,&#x201d; in <conf-name>Proceedings of the 37th International Conference on Machine Learning (PMLR), vol. 119 of Proceedings of Machine Learning Research</conf-name>, <fpage>3177</fpage>&#x2013;<lpage>3187</lpage>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friston</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Hierarchical Models in the Brain</article-title>. <source>PLoS Comput. Biol.</source> <volume>4</volume>, <fpage>e1000211</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000211</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Giuliari</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Berra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bue</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cristani</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). &#x201c;<article-title>POMP&#x2b;&#x2b;: Pomcp-Based Active Visual Search in Unknown Indoor Environments</article-title>,&#x201d; in <conf-name>2021 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)</conf-name> (<publisher-loc>Prague, Czech Republic</publisher-loc>: <publisher-name>IEEE</publisher-name>). <pub-id pub-id-type="doi">10.1109/IROS51168.2021.9635866</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Goldhoorn</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Garrell</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Alqu&#xe9;zar</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Sanfeliu</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Continuous Real Time POMCP to Find-And-Follow People by a Humanoid Service Robot</article-title>,&#x201d; in <conf-name>2014 IEEE-RAS International Conference on Humanoid Robots</conf-name>, <fpage>741</fpage>&#x2013;<lpage>747</lpage>. <pub-id pub-id-type="doi">10.1109/HUMANOIDS.2014.7041445</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hauskrecht</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Value-function Approximations for Partially Observable Markov Decision Processes</article-title>. <source>J. Artif. Intell. Res.</source> <volume>13</volume>, <fpage>33</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1613/jair.678</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kaelbling</surname>
<given-names>L. P.</given-names>
</name>
<name>
<surname>Littman</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Cassandra</surname>
<given-names>A. R.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Planning and Acting in Partially Observable Stochastic Domains</article-title>. <source>Artif. Intell.</source> <volume>101</volume>, <fpage>99</fpage>&#x2013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1016/S0004-3702(98)00023-X</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Katt</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Oliehoek</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Amato</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Bayesian Reinforcement Learning in Factored POMDPs</article-title>,&#x201d; in <conf-name>Proceedings of the 18th International Conference on Autonomous Agents and MultiAgent Systems (International Foundation for Autonomous Agents and Multiagent Systems), AAMAS 2019</conf-name>, <fpage>7</fpage>&#x2013;<lpage>15</lpage>. </citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Katt</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Oliehoek</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Amato</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Learning in POMDPs with Monte Carlo Tree Search</article-title>,&#x201d; in <conf-name>Proceedings of the 34th International Conference on Machine Learning - Volume 70 (JMLR.org), ICML&#x2019;17</conf-name>, <fpage>1819</fpage>&#x2013;<lpage>1827</lpage>. </citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kocsis</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Szepesv&#xe1;ri</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>Bandit Based Monte-Carlo Planning</article-title>,&#x201d; in <conf-name>Proceedings of the 17th European Conference on Machine Learning. ECML 2006</conf-name>, <fpage>282</fpage>&#x2013;<lpage>293</lpage>. <pub-id pub-id-type="doi">10.1007/11871842_29</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Koenig</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Design and Use Paradigms for Gazebo, an Open-Source Multi-Robot Simulator</article-title>,&#x201d; in <conf-name>2004 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS) (IEEE Cat. No.04CH37566). vol. 3</conf-name>, <fpage>2149</fpage>&#x2013;<lpage>2154</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2004.1389727</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lauri</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ritala</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Planning for Robotic Exploration Based on Forward Simulation</article-title>. <source>Robotics Aut. Syst.</source> <volume>83</volume>, <fpage>15</fpage>&#x2013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2016.06.008</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Poupart</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Monte-carlo Tree Search for Constrained POMDPs</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>2018</volume>, <fpage>7934</fpage>&#x2013;<lpage>7943</lpage>. <pub-id pub-id-type="doi">10.1155/2018/7689549</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leonetti</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Iocchi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>A Synthesis of Automated Planning and Reinforcement Learning for Efficient, Robust Decision-Making</article-title>. <source>Artif. Intell.</source> <volume>241</volume>, <fpage>103</fpage>&#x2013;<lpage>130</lpage>. <pub-id pub-id-type="doi">10.1016/j.artint.2016.07.004</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Marder-Eppstein</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Berger</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Foote</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gerkey</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Konolige</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>The Office Marathon: Robust Navigation in an Indoor Office Environment</article-title>,&#x201d; in <conf-name>2010 IEEE International Conference on Robotics and Automation</conf-name>, <fpage>300</fpage>&#x2013;<lpage>307</lpage>. <pub-id pub-id-type="doi">10.1109/ROBOT.2010.5509725</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>McAllester</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1999</year>). &#x201c;<article-title>Approximate Planning for Factored Pomdps Using Belief State Simplification</article-title>,&#x201d; in <conf-name>Proceedings of the Fifteenth Conference on Uncertainty in Artificial Intelligence (Morgan Kaufmann Publishers Inc.), UAI&#x2019;99</conf-name>, <fpage>409</fpage>&#x2013;<lpage>416</lpage>. </citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Murphy</surname>
<given-names>K. P.</given-names>
</name>
</person-group> (<year>2012</year>). <source>Machine Learning: A Probabilistic Perspective</source>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>. </citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ognibene</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mirante</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Marchegiani</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Proactive Intention Recognition for Joint Human-Robot Search and Rescue Missions through Monte-Carlo Planning in Pomdp Environments</article-title>,&#x201d; in <conf-name>Social Robotics - 11th International Conference, ICSR 2019, Proceedings</conf-name> (<publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>332</fpage>&#x2013;<lpage>343</lpage>. <comment>Lecture Notes in Computer Science</comment>. <pub-id pub-id-type="doi">10.1007/978-3-030-35888-4_31</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Papadimitriou</surname>
<given-names>C. H.</given-names>
</name>
<name>
<surname>Tsitsiklis</surname>
<given-names>J. N.</given-names>
</name>
</person-group> (<year>1987</year>). <article-title>The Complexity of Markov Decision Processes</article-title>. <source>Math. OR</source> <volume>12</volume>, <fpage>441</fpage>&#x2013;<lpage>450</lpage>. <pub-id pub-id-type="doi">10.1287/moor.12.3.441</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pineau</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Thrun</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>A Hierarchical Approach to Pomdp Planning and Execution</article-title>,&#x201d; in <conf-name>Workshop on Hierarchy and Memory in Reinforcement Learning (ICML)</conf-name>. </citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pletscher</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ong</surname>
<given-names>C. S.</given-names>
</name>
<name>
<surname>Buhmann</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Spanning Tree Approximations for Conditional Random Fields</article-title>,&#x201d; in <conf-name>Proceedings of the Twelth International Conference on Artificial Intelligence and Statistics (PMLR), vol. 5 of Proceedings of Machine Learning Research</conf-name>, <fpage>408</fpage>&#x2013;<lpage>415</lpage>. </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pineau</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chaib-draa</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kreitmann</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A Bayesian Approach for Learning and Planning in Partially Observable Markov Decision Processes</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>1729</fpage>&#x2013;<lpage>1770</lpage>. </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pineau</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Paquet</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chaib-draa</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Online Planning Algorithms for Pomdps</article-title>. <source>J. Artif. Intell. Res.</source> <volume>32</volume>, <fpage>663</fpage>&#x2013;<lpage>704</lpage>. <pub-id pub-id-type="doi">10.1613/jair.2567</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Russell</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Norvig</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2010</year>). <source>Artificial Intelligence - A Modern Approach</source>. <edition>Third International Edition</edition>. <publisher-loc>London, UK</publisher-loc>: <publisher-name>Pearson Education</publisher-name>. </citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Salakhutdinov</surname>
<given-names>R. R.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Learning in Markov Random Fields Using Tempered Transitions</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems, NeurIPS 2009</source> (<publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <volume>Vol. 22</volume>. </citation>
</ref>
<ref id="B38">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sanner</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2010</year>). <source>Relational Dynamic Influence Diagram Language (RDDL): Language Description</source>. <publisher-loc>Canberra, Australia</publisher-loc>: <publisher-name>Australian National University</publisher-name>. <comment>Unpublished Manuscript</comment>. </citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shah</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wornell</surname>
<given-names>G. W.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>On Learning Continuous Pairwise Markov Random Fields</article-title>,&#x201d; in <conf-name>The 24th International Conference on Artificial Intelligence and Statistics, AISTATS 2021</conf-name>, <conf-date>April 13-15, 2021</conf-date>, <fpage>1153</fpage>&#x2013;<lpage>1161</lpage>. <comment>Virtual Event (PMLR), vol. 130 of Proceedings of Machine Learning Research</comment>. </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maddison</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Guez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sifre</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>van den Driessche</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Mastering the Game of Go with Deep Neural Networks and Tree Search</article-title>. <source>Nature</source> <volume>529</volume>, <fpage>484</fpage>&#x2013;<lpage>489</lpage>. <pub-id pub-id-type="doi">10.1038/nature16961</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Schrittwieser</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Antonoglou</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Guez</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Mastering the Game of Go without Human Knowledge</article-title>. <source>Nature</source> <volume>550</volume>, <fpage>354</fpage>&#x2013;<lpage>359</lpage>. <pub-id pub-id-type="doi">10.1038/nature24270</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Monte-Carlo Planning in Large POMDPs</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems, NeurIPS 2010</source> (<publisher-loc>Red Hook, NY, USA</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <volume>Vol. 23</volume>, <fpage>2164</fpage>&#x2013;<lpage>2172</lpage>. </citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Smith</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Simmons</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Heuristic Search Value Iteration for POMDPs</article-title>,&#x201d; in <conf-name>Proceedings of the 20th Conference on Uncertainty in Artificial Intelligence (AUAI Press), UAI &#x2019;04</conf-name>, <fpage>520</fpage>&#x2013;<lpage>527</lpage>. </citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sondik</surname>
<given-names>E. J.</given-names>
</name>
</person-group> (<year>1978</year>). <article-title>The Optimal Control of Partially Observable Markov Processes over the Infinite Horizon: Discounted Costs</article-title>. <source>Operations Res.</source> <volume>26</volume>, <fpage>282</fpage>&#x2013;<lpage>304</lpage>. <pub-id pub-id-type="doi">10.1287/opre.26.2.282</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Spaan</surname>
<given-names>M. T. J.</given-names>
</name>
<name>
<surname>Vlassis</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>A Point-Based POMDP Algorithm for Robot Planning</article-title>,&#x201d; in <conf-name>Proceedings of the 2004 IEEE International Conference on Robotics and Automation, ICRA 2004</conf-name>, <conf-date>April 26 - May 1, 2004</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2399</fpage>&#x2013;<lpage>2404</lpage>. <pub-id pub-id-type="doi">10.1109/ROBOT.2004.1307420</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Spaan</surname>
<given-names>M. T. J.</given-names>
</name>
<name>
<surname>Vlassis</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Perseus: Randomized Point-Based Value Iteration for Pomdps</article-title>. <source>J. Artif. Intell. Res.</source> <volume>24</volume>, <fpage>195</fpage>&#x2013;<lpage>220</lpage>. <pub-id pub-id-type="doi">10.1613/jair.1659</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sridharan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wyatt</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Dearden</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Hippo: Hierarchical Pomdps for Planning Information Processing and Sensing Actions on a Robot</article-title>,&#x201d; in <conf-name>International Conference on Automated Planning and Scheduling, (ICAPS) (AAAI)</conf-name>, <fpage>346</fpage>&#x2013;<lpage>354</lpage>. </citation>
</ref>
<ref id="B48">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stachniss</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Grisetti</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Burgard</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2005</year>). &#x201c;<article-title>Information Gain-Based Exploration Using Rao-Blackwellized Particle Filters</article-title>,&#x201d; in <conf-name>Proceedings of Robotics: Science and Systems (RSS)</conf-name>. <pub-id pub-id-type="doi">10.15607/rss.2005.i.009</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Reinforcement Learning: An Introduction</source>. <edition>Second edn</edition>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>. </citation>
</ref>
<ref id="B50">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Theocharous</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Murphy</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kaelbling</surname>
<given-names>L. P.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Representing Hierarchical Pomdps as Dbns for Multi-Scale Robot Localization</article-title>,&#x201d; in <conf-name>Proceedings of the 2004 IEEE International Conference on Robotics and Automation, ICRA 2004</conf-name>, <conf-date>April 26 - May 1, 2004</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1045</fpage>&#x2013;<lpage>1051</lpage>. <pub-id pub-id-type="doi">10.1109/ROBOT.2004.1307288</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Theocharous</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rohanimanesh</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mahadevan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>Learning Hierarchical Observable Markov Decision Process Models for Robot Navigation</article-title>,&#x201d; in <conf-name>Proceedings 2001 ICRA. IEEE International Conference on Robotics and Automation (Cat. No.01CH37164). vol. 1</conf-name>, <fpage>511</fpage>&#x2013;<lpage>516</lpage>. <pub-id pub-id-type="doi">10.1109/ROBOT.2001.932601</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Thomas</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Hutin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Buffet</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Monte Carlo Information-Oriented Planning</article-title>,&#x201d; in <conf-name>ECAI 2020 - 24th European Conference on Artificial Intelligence</conf-name> (<publisher-loc>Amsterdam, Netherlands</publisher-loc>: <publisher-name>IOS Press</publisher-name>), <fpage>2378</fpage>&#x2013;<lpage>2385</lpage>. <comment>vol. 325 of Frontiers in Artificial Intelligence and Applications</comment>. <pub-id pub-id-type="doi">10.3233/FAIA200368</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Thrun</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>Monte Carlo POMDPs</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems, NeurIPS 1999</source> (<publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>MIT Press</publisher-name>), <fpage>1064</fpage>&#x2013;<lpage>1070</lpage>. </citation>
</ref>
<ref id="B54">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Upton</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Cook</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>A Dictionary of Statistics</article-title>,&#x201d; in <source>Oxford Paperback Reference</source> (<publisher-loc>Oxford</publisher-loc>: <publisher-name>OUP Oxford</publisher-name>). <pub-id pub-id-type="doi">10.1093/acref/9780199541454.001.0001</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="thesis">
<person-group person-group-type="author">
<name>
<surname>Veiga</surname>
<given-names>T. S.</given-names>
</name>
</person-group> (<year>2015</year>). <source>Information Gain and Value Function Approximation in Task Planning Using POMDPs</source>. <comment>Ph.D. thesis</comment> (<publisher-loc>Lisbon, Portugal</publisher-loc>: <publisher-name>Instituto Superior T&#xe9;cnico, Universidade de Lisboa</publisher-name>).</citation>
</ref>
<ref id="B56">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Veiga</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Spaan</surname>
<given-names>M. T. J.</given-names>
</name>
<name>
<surname>Lima</surname>
<given-names>P. U.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Point-Based POMDP Solving with Factored Value Function Approximation</article-title>,&#x201d; in <source>AAAI</source> (<publisher-loc>Palo Alto, California, U.S.</publisher-loc>: <publisher-name>AAAI Press</publisher-name>), <fpage>2513</fpage>&#x2013;<lpage>2519</lpage>. </citation>
</ref>
<ref id="B57">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Vuffray</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Misra</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lokhov</surname>
<given-names>A. Y.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Efficient Learning of Discrete Graphical Models</article-title>,&#x201d; in <source>Advances in Neural Information Processing Systems, NeurIPS 2020</source>. </citation>
</ref>
<ref id="B58">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Giuliari</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Berra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Del Bue</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>POMP: POMCP-Based Online Motion Planning for Active Visual Search in Indoor Environments</article-title>,&#x201d; in <conf-name>31st British Machine Vision Conference 2020, BMVC 2020</conf-name>, <conf-date>September 7-10, 2020</conf-date> (<publisher-loc>Virtual Event, UK</publisher-loc>: <publisher-name>BMVA Press</publisher-name>). </citation>
</ref>
<ref id="B59">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wertheim</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Brafman</surname>
<given-names>R. I.</given-names>
</name>
<name>
<surname>Shekhar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Feiner</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Pinsky</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>ROS-POMDP &#x2013; A Platform for Robotics Planning Using PLPs and RDDL in ROS</article-title>,&#x201d; in <conf-name>Planning and Robotics Workshop, 30th International Conference on Automated Planning and Scheduling (ICAPS)</conf-name>. </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Williams</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Young</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Partially Observable Markov Decision Processes for Spoken Dialog Systems</article-title>. <source>Comput. Speech &#x26; Lang.</source> <volume>21</volume>, <fpage>393</fpage>&#x2013;<lpage>422</lpage>. <pub-id pub-id-type="doi">10.1016/j.csl.2006.06.008</pub-id> </citation>
</ref>
<ref id="B61">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zuccotto</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Castellini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Farinelli</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Learning State-Variable Relationships for Improving POMCP Performance</article-title>,&#x201d; in <conf-name>The 37th ACM/SIGAPP Symposium on Applied Computing (SAC &#x2019;22)</conf-name>, <conf-date>April 25&#x2013;29, 2022</conf-date>, <fpage>739</fpage>&#x2013;<lpage>747</lpage>. <comment>Virtual Event</comment>. <pub-id pub-id-type="doi">10.1145/3477314.3507049</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>