<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Control. Eng.</journal-id>
<journal-title>Frontiers in Control Engineering</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Control. Eng.</abbrev-journal-title>
<issn pub-type="epub">2673-6268</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1402621</article-id>
<article-id pub-id-type="doi">10.3389/fcteg.2024.1402621</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Control Engineering</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Using reinforcement learning to autonomously identify sources of error for agents in group missions</article-title>
<alt-title alt-title-type="left-running-head">Utimula et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fcteg.2024.1402621">10.3389/fcteg.2024.1402621</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Utimula</surname>
<given-names>Keishu</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2690017/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hayaschi</surname>
<given-names>Ken-taro</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bihl</surname>
<given-names>Trevor J.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hongo</surname>
<given-names>Kenta</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Maezono</surname>
<given-names>Ryo</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2861038/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Materials Science</institution>, <institution>Japan Advanced Institute of Science and Technology (JAIST)</institution>, <addr-line>Nomi</addr-line>, <addr-line>Ishikawa</addr-line>, <country>Japan</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Information Science</institution>, <institution>Japan Advanced Institute of Science and Technology (JAIST)</institution>, <addr-line>Nomi</addr-line>, <addr-line>Ishikawa</addr-line>, <country>Japan</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Air Force Research Laboratory</institution>, <institution>Wright Patterson Air Force Base</institution>, <addr-line>Dayton</addr-line>, <addr-line>OH</addr-line>, <country>United States</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Research Center for Advanced Computing Infrastructure</institution>, <institution>Japan Advanced Institute of Science and Technology (JAIST)</institution>, <addr-line>Nomi</addr-line>, <addr-line>Ishikawa</addr-line>, <country>Japan</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/959863/overview">Antonio Visioli</ext-link>, University of Brescia, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1933239/overview">Zhongguo Li</ext-link>, The University of Manchester, United Kingdom</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1572316/overview">Yunduan Cui</ext-link>, Chinese Academy of Sciences (CAS), China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Keishu Utimula, <email>mwkumk1702@icloud.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>16</day>
<month>10</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>5</volume>
<elocation-id>1402621</elocation-id>
<history>
<date date-type="received">
<day>17</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>09</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Utimula, Hayaschi, Bihl, Hongo and Maezono.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Utimula, Hayaschi, Bihl, Hongo and Maezono</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>When deploying agents to execute a mission involving collective behavior, it is common for accidental malfunctions to occur in some agents. It is challenging to distinguish whether these malfunctions are due to actuator failures or sensor issues based solely on interactions with the affected agent. However, a human operator knows that by directing a group behavior in which other agents collide with the suspected malfunctioning agent, one can monitor the presence or absence of a positional change and identify whether it is the actuator (position changed) or the sensor (position unchanged) that is broken. We have developed artificial intelligence that can autonomously deploy such &#x201c;information acquisition strategies through collective behavior&#x201d; using machine learning. In such problems, the goal is to plan collective actions that result in differences between the hypotheses for the state [<italic>e.g.</italic>, actuator or sensor failure]. Only a few of the possible collective behavior patterns lead to a distinction between the hypotheses. The evaluation function used to maximize the difference between hypotheses is therefore sparse, remaining flat over most of the domain. Gradient-based optimization methods are ineffective in this setting, and reinforcement learning becomes a viable alternative. Applied to this maximization problem, our reinforcement learning approach surprisingly arrives at the optimal solution, producing collective actions that involve collisions to differentiate the causes. Subsequent collective behaviors, reflecting this situation awareness, appeared to involve other agents assisting the malfunctioning agent.</p>
</abstract>
<kwd-group>
<kwd>reinforcement learning</kwd>
<kwd>autonomous agents</kwd>
<kwd>failure detection and recovery</kwd>
<kwd>AI-based methods</kwd>
<kwd>task planning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Control and Automation Systems</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Problem formulation</title>
<p>The group cooperation of agents is an important topic studied in the context of autonomous systems (<xref ref-type="bibr" rid="B21">Lee et al., 2018</xref>; <xref ref-type="bibr" rid="B19">Hu et al., 2020</xref>). Because each agent is likely to have individual biases in its actuator or sensor performance, the ability to autonomously analyze these inherent biases and revise the control plan appropriately is essential for continuing the group mission. Such biases change dynamically over time during missions, occasionally leading to failures in some functions of an agent. When such changes occur, it is essential to promptly revise the transportation plan using methods such as reinforcement learning. However, this requires constructing a virtual environment that accurately reflects real-world conditions. Therefore, to properly update the operational plan using reinforcement learning, it is necessary to identify the causes of the biases, including any failures, in each agent as they occur.</p>
<p>Previous research on fault diagnosis methods in swarm robotic systems includes the work by O&#x27;Keeffe et al. (<xref ref-type="bibr" rid="B28">O&#x2019;Keeffe et al., 2018</xref>). This approach adopts a fault diagnosis mechanism, inspired by biological immune systems, that learns from past diagnostic results to efficiently identify malfunctions based on the behavior of robots. However, the diagnostic tests assumed here only target predictable faults and may struggle when multiple faults occur simultaneously. This difficulty in diagnosis is an inevitable challenge as robotic systems continue to advance.</p>
<p>One of the factors complicating this diagnosis is the difficulty in distinguishing the causes of faults.</p>
<p>Suppose that a command base, which controls a group of agents via each command <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (<xref ref-type="fig" rid="F1">Figure 1A</xref>), has detected an anomaly in the position of an agent (<italic>e.g.,</italic> no change in the position was observed). There are two possible causes for the observed anomaly: (1) actuator failures (agent is unable to move) or (2) sensor failures (agent can move, but the move is not captured by the sensor) (<xref ref-type="fig" rid="F1">Figure 1B</xref>). Depending on the hypothesis [the failure may have occurred in the actuators <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> or sensors <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>], the plan is subsequently calibrated and updated accordingly. However, it is generally difficult to identify which problem caused the anomaly solely through communication between the base and the agent. An intuitive way to identify the correct hypothesis is to have other agents collide with the failed agent and check whether any displacement is observed by its sensor. Such a collision physically displaces the agent; a working sensor would report the displacement, whereas a failed sensor would not. Thus, the correct hypothesis can be identified by &#x201c;planning a group motion.&#x201d; The question then arises as to whether such planning can be set up autonomously as a &#x201c;strategy to acquire environmental information&#x201d; (<xref ref-type="bibr" rid="B11">Friston, 2010</xref>).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Agents perform group actions according to commands communicated from the &#x201c;control base&#x201d; (the figure depicts an example with three agents indexed by <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>). The wavy arrow denotes a command signal from the base, whereas the dotted arrows represent the return signals from each sensor on each agent [panel <bold>(A)</bold>]. When an anomaly is detected in a return signal, two hypotheses&#x2014;<inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>&#x2014;can be considered [panel <bold>(B)</bold>].</p>
</caption>
<graphic xlink:href="fcteg-05-1402621-g001.tif"/>
</fig>
<p>Such autonomous planning appears to be feasible given the following value function. Suppose that the command <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is issued from the control base, directing the agent&#x2019;s action to specify which of the hypotheses (<inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) is supported (<xref ref-type="fig" rid="F1">Figure 1A</xref>). This command updates the agent state to <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. The updated state <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> should be denoted as <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> because it depends on the hypothesis about the state before the update <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. As the expected results differ for different hypotheses, the following expression can be used to evaluate the distinction: <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. If an appropriate plan <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> involving collisions between agents is found, a non-zero difference <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is obtained, and the likelihood of each hypothesis can be determined. We must, therefore, formulate a plan that maximizes <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> to ensure a significant difference. Accordingly, an autonomous action plan can be formulated to maximize <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> as a value function.</p>
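<p>As a minimal illustration, this evaluation can be sketched in Python; the forward model <monospace>step_under_hypothesis</monospace> below is a hypothetical placeholder for a simulator that propagates the joint agent state under a given hypothesis, not the exact implementation used in this study.</p>
<preformat preformat-type="code">
import numpy as np

def hypothesis_gap(g, R0, step_under_hypothesis):
    """D(g) = || R~^(h_s)(g) - R~^(h_a)(g) || for a candidate plan g.

    step_under_hypothesis(h, R0, g) is assumed to return the state predicted
    by the virtual environment when the command sequence g is applied to the
    initial state R0 under hypothesis h ("h_a": broken actuator,
    "h_s": broken sensor).
    """
    R_a = step_under_hypothesis("h_a", R0, g)
    R_s = step_under_hypothesis("h_s", R0, g)
    return np.linalg.norm(R_s - R_a)
</preformat>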
<p>However, this maximization task is difficult to complete via conventional gradient-based optimization. Owing to the wide range of possibilities for <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, interactions such as collisions are rare events; for most plans <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, so it is impossible to distinguish between hypotheses. Namely, sub-spaces with finite <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are sparse in the overall state space (sparse rewards). In such cases, gradient-based optimization is insufficient for the task of formulating appropriate action plans because the gradient is zero over the vast majority of the space. For such sparse reward optimization, reinforcement learning, which has been thoroughly investigated in the applications of autonomous systems (<xref ref-type="bibr" rid="B20">Huang et al., 2005</xref>; <xref ref-type="bibr" rid="B38">Xia and El Kamel, 2016</xref>; <xref ref-type="bibr" rid="B40">Zhu et al., 2018</xref>; <xref ref-type="bibr" rid="B19">Hu et al., 2020</xref>), can be used as an effective alternative.</p>
<p>Reinforcement learning (<xref ref-type="bibr" rid="B26">Nachum et al., 2018</xref>; <xref ref-type="bibr" rid="B35">Sutton and Barto, 2018</xref>; <xref ref-type="bibr" rid="B1">Barto, 2002</xref>) is becoming an established field in the wider context of robotics and system controls (<xref ref-type="bibr" rid="B29">Peng et al., 2018</xref>; <xref ref-type="bibr" rid="B9">Finn and Levine, 2017</xref>). Methodological improvements have been studied intensively, especially through verifications on gaming platforms (<xref ref-type="bibr" rid="B25">Mnih et al., 2015</xref>; <xref ref-type="bibr" rid="B32">Silver et al., 2017</xref>; <xref ref-type="bibr" rid="B36">Vinyals et al., 2019</xref>). Thus, the topic addressed in this study now constitutes a subfield known as multi-agent reinforcement learning (MARL) (<xref ref-type="bibr" rid="B6">Busoniu et al., 2006</xref>; <xref ref-type="bibr" rid="B14">Gupta et al., 2017</xref>; <xref ref-type="bibr" rid="B34">Straub et al., 2020</xref>; <xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>; <xref ref-type="bibr" rid="B13">Gronauer and Diepold, 2021</xref>). Specific examples of multi-agent missions include unmanned aerial vehicles (UAV) (<xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>; <xref ref-type="bibr" rid="B34">Straub et al., 2020</xref>) and sensor resource management (SRM) (<xref ref-type="bibr" rid="B23">Malhotra et al., 2017</xref>, <xref ref-type="bibr" rid="B22">1997</xref>; <xref ref-type="bibr" rid="B17">Hero and Cochran, 2011</xref>; <xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>). The objective of this study can also be regarded as the problem of handling non-stationary environments in multi-agent reinforcement learning (<xref ref-type="bibr" rid="B27">Nguyen et al., 2020</xref>; <xref ref-type="bibr" rid="B10">Foerster et al., 2017</xref>). As a consequence of failure, agents are vulnerable to the gradual loss of homogeneity. Prior studies have addressed the problem of heterogeneity in multi-agent reinforcement learning (<xref ref-type="bibr" rid="B6">Busoniu et al., 2006</xref>; <xref ref-type="bibr" rid="B7">Calvo and Dusparic, 2018</xref>; <xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>; <xref ref-type="bibr" rid="B34">Straub et al., 2020</xref>; <xref ref-type="bibr" rid="B13">Gronauer and Diepold, 2021</xref>). The problem of sparse rewards has also been recognized and discussed as one of the current challenges in reinforcement learning (<xref ref-type="bibr" rid="B37">Wang and Taylor, 2017</xref>; <xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>). Recent advancements in reinforcement learning have introduced various innovative methods for handling single-agent or multi-agent scenarios. These approaches have focused on improving sample efficiency, reducing computational costs, and enhancing learning stability across different frameworks. One such method is TD7, which utilizes state-action learned embeddings (SALE) for jointly learning embeddings of both states and actions (<xref ref-type="bibr" rid="B12">Fujimoto et al., 2024</xref>). CrossQ is another approach that improves sample efficiency while significantly reducing computational costs by utilizing batch normalization (<xref ref-type="bibr" rid="B2">Bhatt et al., 2019</xref>). 
Continuous dynamic policy programming (CDPP) extends relative entropy regularized reinforcement learning from value function-based frameworks to actor-critic structures in continuous action spaces, achieving improved sample efficiency and learning stability (<xref ref-type="bibr" rid="B31">Shang et al., 2023</xref>). Furthermore, dropout Q-functions (DroQ) employs a small ensemble of dropout Q-functions to enhance computational efficiency while maintaining sample efficiency comparable to randomized ensembled double Q-learning (REDQ) (<xref ref-type="bibr" rid="B18">Hiraoka et al., 2021</xref>). In the realm of multi-agent reinforcement learning, multi-agent continuous dynamic policy gradient (MACDPP) has achieved high learning capability and sample efficiency by introducing relative entropy regularization to the centralized training with decentralized execution (CTDE) framework (<xref ref-type="bibr" rid="B24">Miao et al., 2024</xref>).</p>
<p>The discussion thus far can be generalized as follows: Consider a scenario involving <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> agents where some anomalies occur, and multiple hypotheses are conceivable. For instance, similar to the earlier example, there could be cases where only a sensor or only an actuator fails in a single agent. Alternatively, there could be scenarios involving multiple agents where anomalies occur in several sensors and actuators, among other various cases. Furthermore, let <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> denote the state of these <inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> agents, which could be a vector obtained by concatenating the coordinates of <inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> robots. Under hypothesis <inline-formula id="inf27">
<mml:math id="m27">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the state <inline-formula id="inf28">
<mml:math id="m28">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is updated by a command <inline-formula id="inf29">
<mml:math id="m29">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to a new state <inline-formula id="inf30">
<mml:math id="m30">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The difference between the states under hypotheses <inline-formula id="inf31">
<mml:math id="m31">
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf32">
<mml:math id="m32">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> can be expressed as <inline-formula id="inf33">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, similar to earlier. If a virtual environment that faithfully reproduces these agents&#x2019; behavior is prepared, and <inline-formula id="inf34">
<mml:math id="m34">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> that maximizes <inline-formula id="inf35">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be found through reinforcement learning, executing <inline-formula id="inf36">
<mml:math id="m36">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in real systems and observing the outcomes would allow for discrimination between hypotheses. To search for a <inline-formula id="inf37">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> that simultaneously discriminates all hypotheses, reinforcement learning should be conducted to maximize the sum of <inline-formula id="inf38">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> across all combinations of hypotheses.</p>
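<p>Under the same assumptions as the sketch above, the generalized objective simply sums the pairwise gaps over all combinations of hypotheses, for example:</p>
<preformat preformat-type="code">
import itertools
import numpy as np

def total_hypothesis_gap(g, R0, step_under_hypothesis, hypotheses):
    """Sum of the pairwise differences D(l, l') over all hypothesis pairs.

    A command sequence g that keeps this sum large is expected to
    discriminate all hypotheses simultaneously once executed on the
    real agents.
    """
    predictions = {l: step_under_hypothesis(l, R0, g) for l in hypotheses}
    return sum(np.linalg.norm(predictions[l] - predictions[lp])
               for l, lp in itertools.combinations(hypotheses, 2))
</preformat>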
<p>As a prototype of such a problem, we considered a system composed of three agents moving on an <inline-formula id="inf39">
<mml:math id="m39">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>-plane, administered by a command base to perform a cooperative task (<xref ref-type="fig" rid="F2">Figure 2</xref>). In performing the task, each agent is asked to convey an item to a goal post individually. The second agent (&#x23;2) is assumed to be unable to move along the <inline-formula id="inf40">
<mml:math id="m40">
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-direction due to an actuator failure. By quickly verifying tiny displacements in each agent, the command base can detect the problem occurring in &#x23;2. However, it cannot attribute the cause to either the actuators or the sensors. Consequently, the control base sets hypotheses <inline-formula id="inf41">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf42">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and begins planning the best cooperative motions <inline-formula id="inf43">
<mml:math id="m43">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to classify the correct hypothesis via reinforcement learning.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>View of actual machines labeled as Agents &#x23;1&#x2013;&#x23;3. Agent &#x23;2 is unable to move in the <italic>y</italic>-direction due to actuator failure. Agents &#x23;1 and &#x23;3 are on their way to rescue Agent &#x23;2 (see the main text for how the AI determines the action plan for the recovery of Agent &#x23;2).</p>
</caption>
<graphic xlink:href="fcteg-05-1402621-g002.tif"/>
</fig>
<p>Remarkably, the optimal action plan generated by reinforcement learning showed a human-like solution to pinpoint the problem by colliding other agents with the failed agent. By inducing a collision, the base could identify that &#x23;2 is experiencing problems with its actuators rather than sensors. The base then starts planning group motions to complete the conveying task, considering the limited functionality of &#x23;2. We observe that the cooperative tasks are facilitated by a learning process wherein other agents appear to compensate for the deficiency of &#x23;2 by pushing it toward the goal. In the present study, we employed a simple prototype system to demonstrate that reinforcement learning is extremely effective in setting up a verification plan that discriminates between multiple hypotheses for general cases of system failure.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methodology</title>
<p>Let the state space for the agents be <inline-formula id="inf44">
<mml:math id="m44">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. For instance, given three agents <inline-formula id="inf45">
<mml:math id="m45">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2,3</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> situated on an <inline-formula id="inf46">
<mml:math id="m46">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-plane at positions <inline-formula id="inf47">
<mml:math id="m47">
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, their states can be specified as <inline-formula id="inf48">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>; that is, points in six-dimensional space. The state is driven by a command <inline-formula id="inf49">
<mml:math id="m49">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> according to the operation plan generated in the command base. When <inline-formula id="inf50">
<mml:math id="m50">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is assigned to a given <inline-formula id="inf51">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the state is updated depending on which hypothesis <inline-formula id="inf52">
<mml:math id="m52">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is taken, each of which restricts <inline-formula id="inf53">
<mml:math id="m53">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> by an individual constraint:<disp-formula id="equ1">
<mml:math id="m54">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>:</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>The difference<disp-formula id="equ2">
<mml:math id="m55">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</disp-formula>can then serve as the measure for evaluating how well a plan distinguishes between the hypotheses. Therefore, the best operation plan for the distinction should be determined as<disp-formula id="equ3">
<mml:math id="m56">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2a;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="normal">arg&#x2009;max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mspace width="0.3333em"/>
<mml:mi>D</mml:mi>
<mml:mspace width="0.3333em"/>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The naive idea of performing optimization using gradient-based methods is insufficient, owing to the sparseness described in the introduction. For most choices of <inline-formula id="inf54">
<mml:math id="m57">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf55">
<mml:math id="m58">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, so the gradient is zero for most of <inline-formula id="inf56">
<mml:math id="m59">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and provides no guidance for selecting the next update. Accordingly, we employed reinforcement learning as an alternative optimization approach.</p>
<p>Suppose we can evaluate the reward <inline-formula id="inf57">
<mml:math id="m60">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> for the action <inline-formula id="inf58">
<mml:math id="m61">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> to be taken for a given state <inline-formula id="inf59">
<mml:math id="m62">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. In reinforcement learning, decisions aim to maximize the action value <inline-formula id="inf60">
<mml:math id="m63">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, rather than maximizing the immediate reward <inline-formula id="inf61">
<mml:math id="m64">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>. Although the reward <inline-formula id="inf62">
<mml:math id="m65">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> indicates the benefit obtained at that moment, the action value <inline-formula id="inf63">
<mml:math id="m66">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> represents the benefit accumulated over the future. The governing equation that links the given <inline-formula id="inf64">
<mml:math id="m67">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf65">
<mml:math id="m68">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> is known as the Bellman equation, which is expressed in a self-consistent manner. Users specify the reward function <inline-formula id="inf66">
<mml:math id="m69">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> and the detailed specifications of the Bellman equation to self-consistently determine the action value <inline-formula id="inf67">
<mml:math id="m70">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> using a library. In this study, we employed the OpenAI Gym (<xref ref-type="bibr" rid="B5">Brockman et al., 2016</xref>) as such a library. Although the details of the reinforcement learning implementation can be found in the general literature, we provide further details, using the notation adopted in this paper, in Section 1 of the <xref ref-type="sec" rid="s11">Supplementary Material</xref>. In this research, the operational plans are finally determined by the converged action value, <inline-formula id="inf68">
<mml:math id="m71">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, obtained by the self-consistent iterations as<disp-formula id="e1">
<mml:math id="m72">
<mml:mrow>
<mml:mtable class="eqnarray">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>arg</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2009;max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left"/>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mi>arg</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x2009;max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left"/>
<mml:mtd columnalign="left">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mo>&#x22ef;</mml:mo>
</mml:mtd>
<mml:mtd columnalign="left"/>
<mml:mtd columnalign="left">
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
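<p>A minimal sketch of how such a reward can be exposed to the library is given below, written as an OpenAI Gym-style environment in Python; the forward model <monospace>step_under_hypothesis</monospace>, the observation layout, and the termination rule are illustrative assumptions rather than the exact implementation used in this study.</p>
<preformat preformat-type="code">
import itertools
import gym
import numpy as np
from gym import spaces

class HypothesisSeparationEnv(gym.Env):
    """Environment whose per-step reward is the separation between the
    agent states predicted under the competing hypotheses."""

    def __init__(self, R0, step_under_hypothesis, hypotheses=("h_a", "h_s")):
        super().__init__()
        self.R0 = np.asarray(R0, dtype=np.float32)   # e.g., (x1, y1, x2, y2, x3, y3)
        self.step_under_hypothesis = step_under_hypothesis
        self.hypotheses = tuple(hypotheses)
        n = self.R0.size
        # one (dx, dy) command per agent, bounded to a unit step
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(n,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(n * len(self.hypotheses),),
                                            dtype=np.float32)

    def reset(self):
        # keep one copy of the state per hypothesis-conditioned virtual space
        self.R = {h: self.R0.copy() for h in self.hypotheses}
        return self._obs()

    def step(self, action):
        # propagate every virtual space under its own constraint
        for h in self.hypotheses:
            self.R[h] = self.step_under_hypothesis(h, self.R[h], action)
        # reward: summed distance between the hypothesis-conditioned predictions
        gap = float(sum(np.linalg.norm(self.R[l] - self.R[lp])
                        for l, lp in itertools.combinations(self.hypotheses, 2)))
        done = gap > 0.0   # any non-zero gap already discriminates the hypotheses
        return self._obs(), gap, done, {}

    def _obs(self):
        return np.concatenate([self.R[h] for h in self.hypotheses]).astype(np.float32)
</preformat>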
</sec>
<sec id="s3">
<title>3 Experiments</title>
<p>The workflow required to achieve the mission for the agents, as described in &#xa7;1, proceeds as follows:<list list-type="simple">
<list-item>
<p>[0a ] To determine whether errors have occurred in any of the agents, the base issues commands to move all agents by tiny displacements (and consequently, Agent &#x23;2 is found to have an error).</p>
</list-item>
<list-item>
<p>[0b ] Corresponding to each possible hypothesis (<inline-formula id="inf69">
<mml:math id="m73">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf70">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>), the virtual spaces <inline-formula id="inf71">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are prepared by applying each constraint.</p>
</list-item>
<list-item>
<p>[1 ] Reinforcement learning <inline-formula id="inf72">
<mml:math id="m76">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is performed at the command base using the virtual space, generating &#x201c;the operation plan <inline-formula id="inf73">
<mml:math id="m77">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>&#x201d; to distinguish the hypotheses.</p>
</list-item>
<list-item>
<p>[2 ] The plan <inline-formula id="inf74">
<mml:math id="m78">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is performed by the agents. The command base compares the observed trajectory with that obtained in the virtual spaces in Step [1]. In the process, the hypothesis that yields the closest trajectory to that observed is identified as accurate <inline-formula id="inf75">
<mml:math id="m79">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
<list-item>
<p>[3 ] By taking the virtual space <inline-formula id="inf76">
<mml:math id="m80">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as the identified hypothesis, another learning <inline-formula id="inf77">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is performed to get the optimal plan <inline-formula id="inf78">
<mml:math id="m82">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for the original mission (conveying items to goal posts).</p>
</list-item>
<list-item>
<p>[4 ] Agents are operated according to the plan <inline-formula id="inf79">
<mml:math id="m83">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</list-item>
</list>In this context, the term &#x201c;virtual space&#x201d; refers to an environment where physical computations are performed to simulate the movements of agents. In this study, it was implemented using Python. All learning processes and operations are simulated on a Linux server. The learning phase is the most time-intensive, requiring approximately 3&#xa0;h to complete using a single processor without any parallelization. For the learning phase, we implemented the PPO2 (proximal policy optimization, version&#xa0;2) algorithm (<xref ref-type="bibr" rid="B30">Schulman et al., 2015</xref>) from the OpenAI Gym (<xref ref-type="bibr" rid="B5">Brockman et al., 2016</xref>) library. Reinforcement learning <inline-formula id="inf80">
<mml:math id="m84">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> was benchmarked on the multilayer perceptron (MLP) and long short-term memory (LSTM) network structures, with performance compared between them. In the reinforcement learning described in [1], the state used comprised the positions of all agents, and the actions were defined as the direction of movement (x, y) for each agent. Conversely, in the reinforcement learning approach used in [3], the state included not only the positions of all agents but also the number of items each agent carried, the positions of all goal posts, and the number of items at each goal post. The actions remained the same, involving the direction of movement (x, y) for each agent. We did not conduct specific tuning of the hyperparameters, instead using the default settings shown in <xref ref-type="table" rid="T1">Table 1</xref>. However, it has been pointed out that hyperparameter optimization (HPO) can significantly improve the performance of reinforcement learning (<xref ref-type="bibr" rid="B16">Henderson et al., 2018</xref>; <xref ref-type="bibr" rid="B34">Straub et al., 2020</xref>; <xref ref-type="bibr" rid="B4">Bihl et al., 2020</xref>; <xref ref-type="bibr" rid="B33">Snoek et al., 2012</xref>; <xref ref-type="bibr" rid="B8">Domhan et al., 2015</xref>; <xref ref-type="bibr" rid="B3">Bihl et al., 2022</xref>; <xref ref-type="bibr" rid="B39">Young et al., 2020</xref>). The comparison indicates that MLP performs better, with possible reasons given in the third paragraph of &#xa7;4. The results described herein were obtained with the MLP network structure. Notably, LSTM also generated almost identical agent behaviors to those exhibited by the MLP (possible reasons are given in Section 3 of the <xref ref-type="sec" rid="s11">Supplementary Material</xref>).</p>
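<p>One possible encoding of these states, in Python, is sketched below; the argument names and the ordering of the concatenated vector are illustrative assumptions.</p>
<preformat preformat-type="code">
import numpy as np

def build_observation(agent_xy, items_carried, goal_xy, items_at_goal):
    """Observation for the second learning stage (plan beta).

    agent_xy      : (N, 2) array of agent positions
    items_carried : (N,)   number of items carried by each agent
    goal_xy       : (M, 2) array of goal-post positions
    items_at_goal : (M,)   number of items already placed at each goal post

    For the first learning stage (plan alpha), only agent_xy is used.
    """
    return np.concatenate([np.ravel(agent_xy),
                           np.asarray(items_carried, dtype=float),
                           np.ravel(goal_xy),
                           np.asarray(items_at_goal, dtype=float)]).astype(np.float32)
</preformat>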
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>PPO2 hyperparameters used in training.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Parameter</th>
<th align="left">Value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">gamma</td>
<td align="left">0.99</td>
</tr>
<tr>
<td align="left">n_steps</td>
<td align="left">128</td>
</tr>
<tr>
<td align="left">ent_coef</td>
<td align="left">0.01</td>
</tr>
<tr>
<td align="left">learning_rate</td>
<td align="left">0.00025</td>
</tr>
<tr>
<td align="left">vf_coef</td>
<td align="left">0.5</td>
</tr>
<tr>
<td align="left">max_grad_norm</td>
<td align="left">0.5</td>
</tr>
<tr>
<td align="left">lam</td>
<td align="left">0.95</td>
</tr>
<tr>
<td align="left">nminibatches</td>
<td align="left">4</td>
</tr>
<tr>
<td align="left">noptepochs</td>
<td align="left">4</td>
</tr>
<tr>
<td align="left">cliprange</td>
<td align="left">0.2</td>
</tr>
</tbody>
</table>
</table-wrap>
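<p>Purely as an illustration (not part of the original implementation), the hyperparameter names in <xref ref-type="table" rid="T1">Table 1</xref> match the keyword arguments of the PPO2 implementation provided by the Stable Baselines library. Under that assumption, and with a hypothetical environment name, a minimal training sketch could look as follows:</p>
<preformat>
# Hedged sketch: assumes the Stable Baselines (v2) PPO2 API; the environment name
# "AgentConveyEnv-v0" is a hypothetical placeholder for the custom Python simulator.
import gym
from stable_baselines import PPO2

env = gym.make("AgentConveyEnv-v0")      # hypothetical multi-agent conveying environment
model = PPO2(
    "MlpPolicy", env,                    # MLP network structure, as used in the main text
    gamma=0.99, n_steps=128, ent_coef=0.01, learning_rate=0.00025,
    vf_coef=0.5, max_grad_norm=0.5, lam=0.95,
    nminibatches=4, noptepochs=4, cliprange=0.2,
)
model.learn(total_timesteps=1_000_000)   # training budget is illustrative
</preformat>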
<p>The learning process <inline-formula id="inf81">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in Step [1] is performed using two virtual spaces <inline-formula id="inf82">
<mml:math id="m86">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, corresponding to the two hypotheses:<disp-formula id="equ5">
<mml:math id="m87">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>Each <inline-formula id="inf83">
<mml:math id="m88">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> evolves under the constraint imposed by its hypothesis (<italic>e.g.</italic>, <inline-formula id="inf84">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> cannot be updated due to the actuator error). For an operation <inline-formula id="inf85">
<mml:math id="m90">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, the state on each virtual space is updated as<disp-formula id="equ6">
<mml:math id="m91">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>:</mml:mo>
<mml:mtable class="array">
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="center">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
</mml:mtable>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>Taking the value function,<disp-formula id="e2">
<mml:math id="m92">
<mml:mrow>
<mml:mtable class="align" columnalign="left">
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>the two-fold <inline-formula id="inf86">
<mml:math id="m93">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-table is updated self-consistently as<disp-formula id="equ4">
<mml:math id="m94">
<mml:mrow>
<mml:mtable class="eqnarray-star">
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left"/>
<mml:mtd columnalign="left">
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left"/>
<mml:mtd columnalign="left">
<mml:mspace width="1em"/>
<mml:mo>&#x2b;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:mi>F</mml:mi>
<mml:mfenced open="(" close="">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="" close=")">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
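<p>To make the construction concrete, the following sketch (our own illustration, not the authors&#x2019; code) implements toy versions of the two hypothesis-dependent state updates and the value function of <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, which scores an operation by how far the two virtual spaces diverge:</p>
<preformat>
# Hedged sketch of Step [1]: each hypothesis defines its own state-update rule; under
# the actuator hypothesis h_a the faulty agent's y-coordinate is frozen, as in the
# example above.  The faulty-agent index and the toy dynamics are illustrative.
import numpy as np

FAULTY_AGENT = 2                          # hypothetical index of the malfunctioning agent

def step_h_s(g, R):
    """Sensor hypothesis h_s: all agents move as commanded (toy dynamics)."""
    return R + g                          # R, g: arrays of shape (n_agents, 2)

def step_h_a(g, R):
    """Actuator hypothesis h_a: the faulty agent's y-coordinate cannot be updated."""
    g_constrained = g.copy()
    g_constrained[FAULTY_AGENT, 1] = 0.0
    return R + g_constrained

def rho_alpha(g, R):
    """Value function of Equation 2: divergence between the two virtual updates."""
    return np.linalg.norm(step_h_s(g, R) - step_h_a(g, R))
</preformat>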
<p>Denoting the converged table as <inline-formula id="inf87">
<mml:math id="m95">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, the sequence of operations is obtained as given in <xref ref-type="disp-formula" rid="e1">Equation 1</xref>; in other words,<disp-formula id="e3">
<mml:math id="m96">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>The operation sequence generates the two-fold sequence of (virtual) state evolutions as<disp-formula id="e4">
<mml:math id="m97">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2192;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2192;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>as shown in <xref ref-type="fig" rid="F3">Figure 3A</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Agent trajectories driven by the operation plans generated via reinforcement learning (with the MLP neural network structure): <inline-formula id="inf88">
<mml:math id="m98">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> first [panel <bold>(A)</bold>] and <inline-formula id="inf89">
<mml:math id="m99">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> [panel <bold>(B)</bold>]. The trajectories in <bold>(A)</bold> are the virtual states, <inline-formula id="inf90">
<mml:math id="m100">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (two-fold), branching for Agent &#x23;2 with respect to the hypothesis. Those given in <bold>(B)</bold> are the real trajectories, as obtained via <xref ref-type="disp-formula" rid="e6">Equation 6</xref>. The labels (1)&#x2013;(3) indicate the agents, which move along the directions denoted by red arrows. Dotted circles indicate collisions between agents.</p>
</caption>
<graphic xlink:href="fcteg-05-1402621-g003.tif"/>
</fig>
<p>In Step [2], the agents operate according to the plan expressed by <xref ref-type="disp-formula" rid="e3">Equation 3</xref> to update (real) states as<disp-formula id="e5">
<mml:math id="m101">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>to be observed by the command base. The base compares <xref ref-type="disp-formula" rid="e4">Equations 4</xref>, <xref ref-type="disp-formula" rid="e5">5</xref> to identify whether <inline-formula id="inf91">
<mml:math id="m102">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf92">
<mml:math id="m103">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the cause of failure (<inline-formula id="inf93">
<mml:math id="m104">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in this case).</p>
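<p>As a brief illustration (our own sketch, not the authors&#x2019; implementation), the comparison performed by the command base in Step [2] can be expressed as selecting the hypothesis whose virtual trajectory, <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, best matches the observed trajectory, <xref ref-type="disp-formula" rid="e5">Equation 5</xref>:</p>
<preformat>
# Hedged sketch of Step [2]: compare the observed (real) trajectory with the virtual
# trajectories predicted under each hypothesis and return the closer hypothesis.
# Function and variable names are illustrative.
import numpy as np

def identify_hypothesis(real_traj, virtual_trajs):
    """real_traj: list of state arrays; virtual_trajs: dict mapping hypothesis to list of state arrays."""
    errors = {
        h: sum(np.linalg.norm(r - v) for r, v in zip(real_traj, traj))
        for h, traj in virtual_trajs.items()
    }
    return min(errors, key=errors.get)

# Example: identify_hypothesis(observed, {"h_s": traj_hs, "h_a": traj_ha}) returns
# "h_a" when the actuator hypothesis explains the observed motion.
</preformat>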
<p>In Step [3], <inline-formula id="inf94">
<mml:math id="m105">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>-learning is performed for reward <inline-formula id="inf95">
<mml:math id="m106">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The reward function <inline-formula id="inf96">
<mml:math id="m107">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> calculates the sum of the individual agents&#x2019; rewards, where each agent gets a reward of <inline-formula id="inf97">
<mml:math id="m108">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> depending on its distance <inline-formula id="inf98">
<mml:math id="m109">
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> from the goal post. Thus, a higher reward is realized when the agent gets closer to the goal post. By setting <inline-formula id="inf99">
<mml:math id="m110">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf100">
<mml:math id="m111">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>100.0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, a much higher reward value <inline-formula id="inf101">
<mml:math id="m112">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is obtained when the agent reaches the goal post <inline-formula id="inf102">
<mml:math id="m113">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. Although learning efficiency varies depending on the values of <inline-formula id="inf103">
<mml:math id="m114">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf104">
<mml:math id="m115">
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, a relatively high efficiency was achieved by setting <inline-formula id="inf105">
<mml:math id="m116">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mo>&#x226b;</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. The operation sequence is then obtained as<disp-formula id="equ11">
<mml:math id="m117">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>by which the states of the agents are updated as<disp-formula id="e6">
<mml:math id="m118">
<mml:mrow>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>as shown in <xref ref-type="fig" rid="F3">Figure 3B</xref>.</p>
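<p>A minimal sketch of the reward function used in Step [3] is given below; it is our own illustration (not the authors&#x2019; code), with the delta function realised as a small arrival tolerance, which is an implementation assumption:</p>
<preformat>
# Hedged sketch of the Step [3] reward: each agent contributes a/(r + 1) + b*delta(r)
# for its distance r to the assigned goal post, summed over agents, with a = 0.01 and
# b = 100.0 as in the text; reaching the goal post (r = 0) therefore yields a + b.
import numpy as np

A, B = 0.01, 100.0

def rho_beta(agent_positions, goal_positions, tol=1e-9):
    total = 0.0
    for p, g in zip(agent_positions, goal_positions):
        r = float(np.linalg.norm(np.asarray(p) - np.asarray(g)))
        total += A / (r + 1.0)
        if tol > r:                       # delta(r): agent has reached its goal post
            total += B
    return total
</preformat>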
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>
<xref ref-type="fig" rid="F3">Figure 3A</xref> depicts two-fold trajectories, <xref ref-type="disp-formula" rid="e4">Equation 4</xref>, corresponding to the hypotheses <inline-formula id="inf106">
<mml:math id="m119">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf107">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Although <inline-formula id="inf108">
<mml:math id="m121">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for Agent &#x23;1, the branching <inline-formula id="inf109">
<mml:math id="m122">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2260;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> occurs for Agent &#x23;2 during operations. The branching process earns a score via the value function <inline-formula id="inf110">
<mml:math id="m123">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, which indicates that the learning <inline-formula id="inf111">
<mml:math id="m124">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> was conducted properly. Thus, the ability to capture the difference between <inline-formula id="inf112">
<mml:math id="m125">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf113">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> has been realized. The red dotted circle shown in <bold>(A)</bold> represents a collision between Agents &#x23;2 and &#x23;3, inducing the difference between <inline-formula id="inf114">
<mml:math id="m127">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf115">
<mml:math id="m128">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> (the trajectories only reflect the central positions of the agents, while each agent has a finite radius corresponding to its size; therefore, the trajectories themselves do not intersect even when a collision occurs). In addition, the collision strategy is never generated in a rule-based manner, as the agents autonomously deduce their strategy via reinforcement learning.</p>
<p>Three square symbols (closed) situated at the edges of a triangle in <xref ref-type="fig" rid="F3">Figure 3</xref> represent the goal posts for the conveying mission. <xref ref-type="fig" rid="F3">Figure 3B</xref> shows the real trajectories for the mission, where the initial locations of the agents are the final locations in panel <bold>(A)</bold>. From their initial locations, Agents &#x23;1 and &#x23;3 immediately arrived at their goals to complete each mission and subsequently headed toward Agent &#x23;2 for assistance. Meanwhile, Agent &#x23;2 attempted to reach its goal using its limited mobility; that is, only along the <inline-formula id="inf116">
<mml:math id="m129">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-axis. At the closest position, all three agents coalesced, and Agents &#x23;1 and &#x23;3 began pushing Agent &#x23;2 toward the goal. Though this behavior is simply the consequence of earning more from the value function <inline-formula id="inf117">
<mml:math id="m130">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, it appears as if Agent &#x23;1 wants to assist the malfunctioning agent cooperatively (a video of the behavior shown in <xref ref-type="fig" rid="F3">Figure 3B</xref> is available online; <xref ref-type="bibr" rid="B15">Hayaschi, 2024</xref>). Once the constraint <inline-formula id="inf118">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> on the agents has been identified in the learning phase <inline-formula id="inf119">
<mml:math id="m132">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, the subsequent learning phase <inline-formula id="inf120">
<mml:math id="m133">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is confirmed to generate optimal operation plans that maximize the team&#x2019;s benefit through cooperative behavior, as if the team had made an autonomous decision.</p>
<p>During training, if the target reward is not reached within the given number of training sessions, the training process is reset to avoid being trapped in a local solution. In <xref ref-type="fig" rid="F4">Figure 4</xref>, the training curves of rejected trials are shown in blue, whereas the accepted result is shown in red. Evidently, more learning processes were rejected in <inline-formula id="inf121">
<mml:math id="m134">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (right panel) than in <inline-formula id="inf122">
<mml:math id="m135">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (left panel). This indicates that transport planning with the three agents, one of which is malfunctioning, is a more challenging task than planning the operations that pinpoint one hypothesis among the two. However, under more complex failure conditions, more learning is expected to be rejected for <inline-formula id="inf123">
<mml:math id="m136">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as well, as the number of possible hypotheses increases.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Learning curves evaluated for the MLP network structure in terms of the reward function. Results for <inline-formula id="inf124">
<mml:math id="m137">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (left panel) and <inline-formula id="inf125">
<mml:math id="m138">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b2;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (right panel) are shown. Blue and red curves correspond to trajectories that did not reach the target reward at the end of training and those that successfully reached the target reward, respectively.</p>
</caption>
<graphic xlink:href="fcteg-05-1402621-g004.tif"/>
</fig>
<p>The performance of LSTM and MLP was compared in terms of the success rate for obtaining working trajectories to distinguish between the hypotheses. Notably, even when applying the well-converged <inline-formula id="inf126">
<mml:math id="m139">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-table, a certain fraction of the generated trajectories are non-working, i.e., they fail to produce a difference between the hypotheses. This is a result of the stochastic nature of the policy in generating the trajectories. In the present work, we took 50 independent <inline-formula id="inf127">
<mml:math id="m140">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-tables, each of which was generated from scratch, and obtained 50 corresponding trajectories. The rate of obtaining trajectories that distinguish among the hypotheses amounts to 94% for the MLP and 78% for the LSTM. In the present comparison, we used the same number of iteration steps for <inline-formula id="inf128">
<mml:math id="m141">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-table convergence. Because LSTM has a more complex internal structure, its learning quality was expected to be relatively lower than that of the MLP under the same conditions, and its success rate was likewise expected to be lower. In other words, a higher iteration cost is required for LSTM to achieve performance comparable to the MLP. As such, the results shown in the main text are those obtained with the MLP, whereas those obtained with LSTM are presented in the <xref ref-type="sec" rid="s11">Supplementary Material</xref> for reference.</p>
<p>For a simulation in a virtual environment space, we must evaluate the distances between agents at every step. As this is a pairwise evaluation, its computational cost scales as <inline-formula id="inf129">
<mml:math id="m142">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for <inline-formula id="inf130">
<mml:math id="m143">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> agents. This cost scaling can be mitigated by using the domain decomposition method wherein each agent is evaluated according to its voxel, and the distance between agents is represented by that between corresponding voxels registered in advance. The corresponding cost scales linearly with <inline-formula id="inf131">
<mml:math id="m144">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, which becomes far cheaper than the naive <inline-formula id="inf132">
<mml:math id="m145">
<mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> evaluation method as the number of agents <inline-formula id="inf133">
<mml:math id="m146">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> increases.</p>
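<p>A minimal sketch of such a voxel-based neighbour search is given below; it is our own illustration of the idea (function and variable names are assumptions), not code from the study:</p>
<preformat>
# Hedged sketch of the domain-decomposition (voxel) neighbour search: agents are
# registered to voxels of edge length `cutoff`, and only agents in the same or
# adjacent voxels are compared, so the cost grows roughly linearly with N.
from collections import defaultdict
from itertools import product
import numpy as np

def close_pairs(positions, cutoff):
    """Return pairs (i, j) of agents whose distance is below `cutoff`."""
    positions = np.asarray(positions, dtype=float)
    voxels = defaultdict(list)
    for i, p in enumerate(positions):
        voxels[tuple(np.floor(p / cutoff).astype(int))].append(i)
    pairs = set()
    for key, members in voxels.items():
        for offset in product((-1, 0, 1), repeat=positions.shape[1]):
            neighbour = tuple(k + o for k, o in zip(key, offset))
            for i in members:
                for j in voxels.get(neighbour, []):
                    if j > i and cutoff > np.linalg.norm(positions[i] - positions[j]):
                        pairs.add((i, j))
    return sorted(pairs)
</preformat>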
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>Agents performing group missions can suffer from errors during those missions. Multiple hypotheses may be devised to explain the causes of such errors. Cooperative behaviors, such as collisions between agents, can be deployed to identify those causes. We considered the autonomous planning of such group behaviors via machine-learning techniques. Different hypotheses explaining the causes of the errors lead to different expected states when the same initial state is updated by the same operation. The larger this difference becomes, the better the corresponding operation plan can distinguish between the hypotheses. In other words, the magnitude of the difference can serve as the value function with which to optimize the desired operation plan. Gradient-based optimization does not work well here because only a tiny fraction of the vast number of possible operations (<italic>e.g.</italic>, collisions) captures the difference, leading to a sparse distribution of finite values of the function. We found that reinforcement learning is well suited to such problems. Notably, the optimal plan obtained via reinforcement learning was an operation that causes agents to collide with each other. Once the cause of the error has been identified using this plan, a further reinforcement learning generates a revised mission plan that incorporates the identified failure, in which the malfunctioning agent receives assistance from the other agents so that an appropriate cooperation procedure is ensured. In this study, we conducted tests under the significant constraint that one of the three agents was malfunctioning. As described in &#xa7;1, the framework can generally be formulated for <inline-formula id="inf134">
<mml:math id="m147">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> agents. Future research will need to explore more detailed studies, including changes in the number of agents and variations in malfunctions. The findings presented in this paper provide initial insights into the capabilities of the proposed methods. Additional comparisons and results based on multiple trials, as well as comparisons with a greater number of baselines, are necessary to substantiate the conclusions of this study further.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>KU: conceptualization, methodology, validation, and writing&#x2013;review and editing. K-tH: data curation, visualization, and writing&#x2013;review and editing. TB: validation and writing&#x2013;review and editing. KH: writing&#x2013;review and editing. RM: funding acquisition, supervision, and writing&#x2013;original draft.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by MEXT-KAKENHI (19H04692 and 16KK0097) and the Air Force Office of Scientific Research (AFOSR-AOARD/FA2386-17-1-4049; FA2386-19-1-4015).</p>
</sec>
<ack>
<p>The computations in this work were performed using the facilities at the Research Center for Advanced Computing Infrastructure at JAIST. RM is grateful for financial support from MEXT-KAKENHI (19H04692 and 16KK0097) and from the Air Force Office of Scientific Research (AFOSR-AOARD/FA2386-17-1-4049; FA2386-19-1-4015). The authors would like to thank Kosuke Nakano for his feedback as it significantly helped improve the overall paper. This work was cleared for public release under case AFRL-2023-3698. This work is the work of the authors and does not reflect any opinion or position of the U.S. government, the U.S. Air Force, or the Air Force Research Laboratory.</p>
</ack>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fcteg.2024.1402621/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fcteg.2024.1402621/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Presentation1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2002</year>). &#x201c;<article-title>Innovation and intellectual property rights</article-title>,&#x201d; in <source>The handbook of brain theory and neural networks</source>. Editor <person-group person-group-type="editor">
<name>
<surname>Arbib</surname>
<given-names>M. A.</given-names>
</name>
</person-group> <edition>Second Edition</edition> (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>), <fpage>963</fpage>&#x2013;<lpage>972</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhatt</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Palenicek</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Belousov</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Argus</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Amiranashvili</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Crossq: batch normalization in deep reinforcement learning for greater sample efficiency and simplicity</article-title>. <comment>
<italic>arXiv preprint arXiv:1902.05605</italic>
</comment>
</citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bihl</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Farr</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Straub</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bontempo</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Assessing multi-agent reinforcement learning algorithms for autonomous sensor resource management</article-title>,&#x201d; in <source>Proceedings of the 55th Hawaii international Conference on system Sciences (Hawaii international conference on system Sciences)</source> (<publisher-loc>Honolulu, USA</publisher-loc>: <publisher-name>HICSS</publisher-name>). <pub-id pub-id-type="doi">10.24251/hicss.2022.695</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bihl</surname>
<given-names>T. J.</given-names>
</name>
<name>
<surname>Schoenbeck</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Steeneck</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jordan</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Easy and efficient hyperparameter optimization to address some artificial intelligence &#x201c;ilities&#x201d;</article-title>,&#x201d; in <source>53rd Hawaii international conference on system Sciences, HICSS 2020, maui, Hawaii, USA, january 7-10, 2020 ScholarSpace</source>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brockman</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pettersson</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Openai gym</article-title>
</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Busoniu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Babuska</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>De Schutter</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>Multi-agent reinforcement learning: a survey</article-title>,&#x201d; in <source>2006 9th international conference on control, automation, robotics and vision</source>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/ICARCV.2006.345353</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Calvo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dusparic</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Heterogeneous multi-agent deep reinforcement learning for traffic lights control</article-title>. <source>Proc. 26th Ir. Conf. Artif. Intell. Cogn. Sci.</source>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Domhan</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Springenberg</surname>
<given-names>J. T.</given-names>
</name>
<name>
<surname>Hutter</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Speeding up automatic hyperparameter optimization of deep neural networks by extrapolation of learning curves</article-title>,&#x201d; in <source>Proceedings of the 24th international conference on artificial intelligence</source>, <fpage>3460</fpage>&#x2013;<lpage>3468</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Deep visual foresight for planning robot motion</article-title>,&#x201d; in <source>2017 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>2786</fpage>&#x2013;<lpage>2793</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2017.7989324</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Foerster</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nardelli</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Farquhar</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Afouras</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Torr</surname>
<given-names>P. H. S.</given-names>
</name>
<name>
<surname>Kohli</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Stabilising experience replay for deep multi-agent reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the 34th international conference on machine learning</source>. <source>(PMLR), vol. 70 of <italic>Proceedings of machine learning research</italic>
</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Precup</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Teh</surname>
<given-names>Y. W.</given-names>
</name>
</person-group>, <fpage>1146</fpage>&#x2013;<lpage>1155</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Friston</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>The free-energy principle: a unified brain theory?</article-title> <source>Nat. Rev. Neurosci.</source> <volume>11</volume>, <fpage>127</fpage>&#x2013;<lpage>138</lpage>. <pub-id pub-id-type="doi">10.1038/nrn2787</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fujimoto</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>W.-D.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Precup</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Meger</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>For sale: state-action representation learning for deep reinforcement learning</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>36</volume>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gronauer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Diepold</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Multi-agent deep reinforcement learning: a survey</article-title>. <source>Artif. Intell. Rev.</source> <volume>55</volume>, <fpage>895</fpage>&#x2013;<lpage>943</lpage>. <pub-id pub-id-type="doi">10.1007/s10462-021-09996-w</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>J. K.</given-names>
</name>
<name>
<surname>Egorov</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kochenderfer</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Cooperative multi-agent control using deep reinforcement learning</article-title>,&#x201d; in <source>Autonomous agents and multiagent systems</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Sukthankar</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rodriguez-Aguilar</surname>
<given-names>J. A.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer International Publishing</publisher-name>), <fpage>66</fpage>&#x2013;<lpage>83</lpage>.</citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hayaschi</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Video for fig. 3</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.dropbox.com/s/feejhj389h7p215/robot2_labeled.mp4?dl=0">https://www.dropbox.com/s/feejhj389h7p215/robot2_labeled.mp4?dl&#x3d;0</ext-link>.</comment>
</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Henderson</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Islam</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bachman</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pineau</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Precup</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Meger</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Deep reinforcement learning that matters</article-title>,&#x201d; in <source>Aaai</source>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hero</surname>
<given-names>A. O.</given-names>
</name>
<name>
<surname>Cochran</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Sensor management: past, present, and future</article-title>. <source>IEEE Sensors J.</source> <volume>11</volume>, <fpage>3064</fpage>&#x2013;<lpage>3075</lpage>. <pub-id pub-id-type="doi">10.1109/JSEN.2011.2167964</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hiraoka</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Imagawa</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hashimoto</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Onishi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tsuruoka</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Dropout q-functions for doubly efficient reinforcement learning</article-title>. <source>arXiv Prepr. arXiv:2110.02034</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Carrasco</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lennox</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Arvin</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Voronoi-based multi-robot autonomous exploration in unknown environments via deep reinforcement learning</article-title>. <source>IEEE Trans. Veh. Technol.</source> <volume>69</volume>, <fpage>14413</fpage>&#x2013;<lpage>14423</lpage>. <pub-id pub-id-type="doi">10.1109/TVT.2020.3034800</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>B.-Q.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>G.-Y.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Reinforcement learning neural network to the problem of autonomous mobile robot obstacle avoidance</article-title>. <source>2005 Int. Conf. Mach. Learn. Cybern.</source> <volume>1</volume>, <fpage>85</fpage>&#x2013;<lpage>89</lpage>. <pub-id pub-id-type="doi">10.1109/ICMLC.2005.1526924</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Planning and control for collision-free cooperative aerial transportation</article-title>. <source>IEEE Trans. Automation Sci. Eng.</source> <volume>15</volume>, <fpage>189</fpage>&#x2013;<lpage>201</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2016.2605707</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Malhotra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Blasch</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Johnson</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Learning sensor-detection policies</article-title>. <source>Proc. IEEE 1997 Natl. Aerosp. Electron. Conf. NAECON 1997</source> <volume>2</volume>, <fpage>769</fpage>&#x2013;<lpage>776</lpage>. <pub-id pub-id-type="doi">10.1109/NAECON.1997.622727</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Malhotra</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Pribilski</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Toole</surname>
<given-names>P. A.</given-names>
</name>
<name>
<surname>Agate</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Decentralized asset management for collaborative sensing</article-title>,&#x201d; in <source>Micro- and nanotechnology sensors, systems, and applications IX</source>. Editors <person-group person-group-type="editor">
<name>
<surname>George</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Dutta</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Islam</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<publisher-name>SPIE, the International Society for Optics and Photonics</publisher-name>), <volume>10194</volume>, <fpage>403</fpage>&#x2013;<lpage>414</lpage>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Effective multi-agent deep reinforcement learning control with relative entropy regularization</article-title>. <source>IEEE Trans. Automation Sci. Eng.</source>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2024.3398712</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mnih</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bellemare</surname>
<given-names>M. G.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Human-level control through deep reinforcement learning</article-title>. <source>Nature</source> <volume>518</volume>, <fpage>529</fpage>&#x2013;<lpage>533</lpage>. <pub-id pub-id-type="doi">10.1038/nature14236</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Nachum</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Data-efficient hierarchical reinforcement learning</article-title>,&#x201d; in <source>Proceedings of the 32nd international conference on neural information processing systems</source>, <fpage>3307</fpage>&#x2013;<lpage>3317</lpage>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>N. D.</given-names>
</name>
<name>
<surname>Nahavandi</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep reinforcement learning for multiagent systems: a review of challenges, solutions, and applications</article-title>. <source>IEEE Trans. Cybern.</source> <volume>50</volume>, <fpage>3826</fpage>&#x2013;<lpage>3839</lpage>. <pub-id pub-id-type="doi">10.1109/TCYB.2020.2977374</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Keeffe</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tarapore</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Millard</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Timmis</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Adaptive online fault diagnosis in autonomous robot swarms</article-title>. <source>Front. Robotics AI</source> <volume>5</volume>, <fpage>131</fpage>. <pub-id pub-id-type="doi">10.3389/frobt.2018.00131</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>X. B.</given-names>
</name>
<name>
<surname>Andrychowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sim-to-real transfer of robotic control with dynamics randomization</article-title>,&#x201d; in <source>2018 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>3803</fpage>&#x2013;<lpage>3810</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8460528</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Moritz</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jordan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Trust region policy optimization</article-title>. <source>Proc. 32nd Int. Conf. Mach. Learn.</source> <volume>37</volume>, <fpage>1889</fpage>&#x2013;<lpage>1897</lpage>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Relative entropy regularized sample-efficient reinforcement learning with continuous actions</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source>, <fpage>1</fpage>&#x2013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2023.3329513</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Schrittwieser</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Antonoglou</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Guez</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Mastering the game of go without human knowledge</article-title>. <source>Nature</source> <volume>550</volume>, <fpage>354</fpage>&#x2013;<lpage>359</lpage>. <pub-id pub-id-type="doi">10.1038/nature24270</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Snoek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Adams</surname>
<given-names>R. P.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Practical bayesian optimization of machine learning algorithms</article-title>. <source>Proc. 25th Int. Conf. Neural Inf. Process. Syst.</source> <volume>2</volume>, <fpage>2951</fpage>&#x2013;<lpage>2959</lpage>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Straub</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Bontempo</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Farr</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bihl</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Sensor resource management using multi-agent reinforcement learning with hyperparameter optimization</article-title>. <source>Tech. Rep.</source> <comment>White paper</comment>.</citation>
</ref>
<ref id="B35">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Barto</surname>
<given-names>A. G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>Reinforcement learning: an introduction</source>. <edition>second edn</edition>. <publisher-name>The MIT Press</publisher-name>.</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Babuschkin</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Czarnecki</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Mathieu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dudzik</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chung</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Grandmaster level in starcraft ii using multi-agent reinforcement learning</article-title>. <source>Nature</source> <volume>575</volume>, <fpage>350</fpage>&#x2013;<lpage>354</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1724-z</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Taylor</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Improving reinforcement learning with confidence-based demonstrations</article-title>,&#x201d; in <source>Proceedings of the twenty-sixth international joint conference on artificial intelligence</source> (<publisher-loc>Darmstadt, Germany</publisher-loc>: <publisher-name>IJCAI-17</publisher-name>), <fpage>3027</fpage>&#x2013;<lpage>3033</lpage>. <pub-id pub-id-type="doi">10.24963/ijcai.2017/422</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>El Kamel</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Neural inverse reinforcement learning in autonomous navigation</article-title>. <source>Robotics Aut. Syst.</source> <volume>84</volume>, <fpage>1</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1016/j.robot.2016.06.003</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Young</surname>
<given-names>M. T.</given-names>
</name>
<name>
<surname>Hinkle</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Kannan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ramanathan</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Distributed bayesian optimization of deep reinforcement learning algorithms</article-title>. <source>J. Parallel Distributed Comput.</source> <volume>139</volume>, <fpage>43</fpage>&#x2013;<lpage>52</lpage>. <pub-id pub-id-type="doi">10.1016/j.jpdc.2019.07.008</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>M. Q.-H.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Deep reinforcement learning supervised autonomous exploration in office environments</article-title>,&#x201d; in <source>2018 IEEE international conference on robotics and automation (ICRA)</source>, <fpage>7548</fpage>&#x2013;<lpage>7555</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8463213</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>