<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurorobot.</journal-id>
<journal-title>Frontiers in Neurorobotics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurorobot.</abbrev-journal-title>
<issn pub-type="epub">1662-5218</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnbot.2024.1376215</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Curiosity model policy optimization for robotic manipulator tracking control with input saturation in uncertain environment</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Tu</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Fujie</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2639072/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Xie</surname> <given-names>Zhongye</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Qin</surname> <given-names>Feiyan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>College of Computer Science and Technology, Dongguan University of Technology</institution>, <addr-line>Dongguan</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>College of Outstanding Engineers, Dongguan University of Technology</institution>, <addr-line>Dongguan</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Keigo Watanabe, Okayama University, Japan</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Toshiyuki Yasuda, University of Toyama, Japan</p>
<p>Yuichiro Toda, Okayama University, Japan</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Fujie Wang <email>wangfujie128&#x00040;gmail.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>01</day>
<month>05</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>18</volume>
<elocation-id>1376215</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>03</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Wang, Wang, Xie and Qin.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Wang, Wang, Xie and Qin</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>In uncertain environments with robot input saturation, both model-based reinforcement learning (MBRL) and traditional controllers struggle to perform control tasks optimally. In this study, an algorithmic framework of Curiosity Model Policy Optimization (CMPO) is proposed by combining curiosity and model-based approach, where tracking errors are reduced via training agents on control gains for traditional model-free controllers. To begin with, a metric for judging positive and negative curiosity is proposed. Constrained optimization is employed to update the curiosity ratio, which improves the efficiency of agent training. Next, the novelty distance buffer ratio is defined to reduce bias between the environment and the model. Finally, CMPO is simulated with traditional controllers and baseline MBRL algorithms in the robotic environment designed with non-linear rewards. The experimental results illustrate that the algorithm achieves superior tracking performance and generalization capabilities.</p></abstract>
<kwd-group>
<kwd>robotic manipulator</kwd>
<kwd>input saturation</kwd>
<kwd>uncertain environment</kwd>
<kwd>model-based reinforcement learning</kwd>
<kwd>intrinsic motivation</kwd>
<kwd>buffer schedule</kwd>
</kwd-group>
<counts>
<fig-count count="12"/>
<table-count count="0"/>
<equation-count count="29"/>
<ref-count count="41"/>
<page-count count="16"/>
<word-count count="9648"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Robotic manipulator trajectory tracking control as a classical control task has been broadly discussed in academia and industry. Previous knowledge of the kinematic and dynamic model of the robotic manipulator is required by most traditional controllers (Thuruthel et al., <xref ref-type="bibr" rid="B34">2019</xref>). Several estimation methods such as parameter identification (Zhang et al., <xref ref-type="bibr" rid="B40">2024</xref>) and state estimation (Wei et al., <xref ref-type="bibr" rid="B36">2023</xref>) have been proposed to alleviate the tolerance of the robot model. However, it is still essential to have knowledge of the fundamental model and recalibrate the parameters for various types of robotic manipulators (&#x000CD;&#x000F1;igo Elguea-Aguinaco et al., <xref ref-type="bibr" rid="B8">2023</xref>). Reinforcement Learning (RL) achieves maximum reward by training agents in an environment, without knowing the specific robot model. Model-Free Reinforcement Learning (MFRL) can accomplish these types of skills rather than just programming a fixed task through a procedure (Hu et al., <xref ref-type="bibr" rid="B13">2020</xref>). Therefore, controller tuning time can be saved by using an agent to operate. The primary limitations lie in the high cost of training due to model-free methods, requiring extensive data and inefficient interaction with the real world (Luo et al., <xref ref-type="bibr" rid="B24">2022</xref>).</p>
<p>The emerging model-based methods deliver higher sampling efficiency than model-free methods through learning a dynamic model (called the world model in this study) of the environment (Peng et al., <xref ref-type="bibr" rid="B29">2018</xref>; Luo et al., <xref ref-type="bibr" rid="B24">2022</xref>). Enormous amount of environmental simulation data is generated from the world model for agent training, which remarkably reduces the cost of data generation by interacting with real robotic manipulators (Hu et al., <xref ref-type="bibr" rid="B13">2020</xref>). This advantage offers compelling potential for applications in many complex environments, such as robotic manipulators (Pane et al., <xref ref-type="bibr" rid="B27">2019</xref>; Thuruthel et al., <xref ref-type="bibr" rid="B34">2019</xref>; Lu et al., <xref ref-type="bibr" rid="B23">2021</xref>). However, due to the robotic uncertainties, it is difficult to train the world model with limited prior knowledge. Furthermore, as elaborated in the study by Guo et al. (<xref ref-type="bibr" rid="B11">2021</xref>), another challenge for robot learning control may be input saturation and external disturbance, which are frequently encountered and unavoidable in mechanical systems. An effective way to mitigate these problems is to increase the agent&#x00027;s ability to explore. Well-optimized agents can discover the general shape of these challenges and provide control methods accordingly.</p>
<p>Intrinsic motivation maps novelty-based rewards via digging into implicit features of the environment to improve the efficiency of agent exploration in unknown environments (Sun et al., <xref ref-type="bibr" rid="B33">2022b</xref>). Curiosity-driven, as its offshoot, evaluates the novelty of states through self-supervised learning, which is later used to compute intrinsic rewards (Burda et al., <xref ref-type="bibr" rid="B3">2018a</xref>). This technique uses standalone modules that can be easily integrated into reinforcement learning frameworks. Hence, it has been widely discussed and applied to improve sampling efficiency (Sun et al., <xref ref-type="bibr" rid="B33">2022b</xref>). Various applications have demonstrated the effectiveness of curiosity-driven approaches in both dense and sparse reward scenarios (Gao et al., <xref ref-type="bibr" rid="B9">2023</xref>). Nevertheless, inappropriate ratio design can interfere with expressing extrinsic rewards in dense reward settings (Zhelo et al., <xref ref-type="bibr" rid="B41">2018</xref>). Some references have explored more complex relevance (Wu et al., <xref ref-type="bibr" rid="B37">2022</xref>) and contrastive learning (Sun et al., <xref ref-type="bibr" rid="B32">2022a</xref>) to mitigate the instability of pure-state features. Unfortunately, these attempts have limited effectiveness in enhancing robot environments that only provide physical information. Curiosity-driven expression of intrinsic rewards can be augmented by using dynamically shifting ratios instead of irrationally fixed designs, which require a rational evaluation metric.</p>
<p>In this study, MBRL is adopted to strengthen the efficiency of agent training. Meanwhile, integrating intrinsic curiosity with world models is proposed as a scheme to elevate performance in uncertain environments with robot input saturation. Based on the above, the Curiosity Model Policy Optimization (CMPO) framework is proposed, which efficiently blends curiosity with the world model by adaptively adjusting the changes in intrinsic rewards and reward ratios through rich evaluation metrics. The agent is responsible for configuring the controller gain to provide the necessary inputs to the robotic manipulator in the environment.</p>
<p>The CMPO algorithmic framework offers the benefits of fast data collection and curiosity-driven exploration for world model. This means that agents trained using this framework can work alongside traditional controllers to significantly enhance the performance of robotic manipulators. The main work and contributions are summarized below:</p>
<list list-type="bullet">
<list-item><p>Unlike the approach in which intrinsic rewards are always defined as positive in the study by Pathak et al. (<xref ref-type="bibr" rid="B28">2017</xref>), a positive&#x02013;negative intrinsic evaluation approach is defined, which adopts the world model to predict the effects of intrinsic rewards. Motivated by Haarnoja et al. (<xref ref-type="bibr" rid="B12">2019</xref>), by simply designing the intrinsic reward target, the adaptive ratio is proposed to be automatically tuned during curiosity exploration. These two modules work together to improve the sampling efficiency of the world model and agent.</p></list-item>
<list-item><p>Inspired by the FVI bound theory (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>) and the use of curiosity (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>), the data novelty distance is designed to adjust the ratio of data sampled from the environment buffer and model buffer in each training episode, reducing the influence of external disturbance. Additionally, a non-linear reward system is created to enhance agent training. Sensible data buffer scheduling and the use of reward systems increase the training speed of the agent.</p></list-item>
<list-item><p>Building upon the foundation of MBPO (Janner et al., <xref ref-type="bibr" rid="B16">2021</xref>), CMPO overcomes the obstacles of world model fitting in uncertain environments with robot input saturation. Training performance comparison exhibits superior control performance and generalization ability. Ablation experiments demonstrated the help provided by each module. Moreover, parameter sensitivity experiments provide valuable references for CMPO hyperparameter selection.</p></list-item>
</list></sec>
<sec id="s2">
<title>2 Related works</title>
<sec>
<title>2.1 Model-based RL</title>
<p>Within the realm of MBRL, Dyna-Q-like methods (Peng et al., <xref ref-type="bibr" rid="B29">2018</xref>) constitute a distinct category. Rather than relying on a single model, ME-TRPO (Kurutach et al., <xref ref-type="bibr" rid="B18">2018</xref>) employs a B-length bootstrap model, which is trained in SLBO (Luo et al., <xref ref-type="bibr" rid="B25">2021</xref>), utilizing a multi-step L2 loss function. During the same period, PETS (Chua et al., <xref ref-type="bibr" rid="B7">2018</xref>) systematically interpreted the ensemble model as resolving aleatoric uncertainty. MBPO (Janner et al., <xref ref-type="bibr" rid="B16">2021</xref>) exploits their advantages to effectively improve model sampling efficiency by proving monotonic lower bound guarantees for branch prediction. Subsequently, BMPO (Lai et al., <xref ref-type="bibr" rid="B20">2020</xref>) further extends MBPO to bidirectional branching forecasts. AMPO (Shen et al., <xref ref-type="bibr" rid="B30">2020</xref>) reduces the mismatch between the model and environment data. Nevertheless, frequent updates distort the predictions of the network and the appropriate start-stop scheme is not given in the study by Luo et al. (<xref ref-type="bibr" rid="B24">2022</xref>). In this study, MBPO is utilized in the CMPO to ensure monotonic bounds, and scheduling theory (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>) is employed to ensure that training data can be sampled correctly.</p></sec>
<sec>
<title>2.2 RL with traditional controller</title>
<p>Combining reinforcement learning with controllers can facilitate task execution by exploiting their advantages simultaneously. By exporting the control gain for the traditional controller via RL, Wang et al. (<xref ref-type="bibr" rid="B35">2020</xref>) uses DQN to control the trajectory tracking of the mobile robot. Unlike the method of the study by Lu et al. (<xref ref-type="bibr" rid="B23">2021</xref>), the agent output is linearized with the controller output in the study by Xu et al. (<xref ref-type="bibr" rid="B38">2019</xref>) and a non-linear fuzzy reward system is designed for DDPG. Hu et al. (<xref ref-type="bibr" rid="B13">2020</xref>) further employs the RL approach with kernel model to elevate sampling efficiency and tracking capability. In this study, a similar view in the study by Xu et al. (<xref ref-type="bibr" rid="B38">2019</xref>) is utilized to design the non-linear rewards and follow the simulation experiment design methodology in the study by Hu et al. (<xref ref-type="bibr" rid="B13">2020</xref>).</p></sec>
<sec>
<title>2.3 Curiosity-driven exploration</title>
<p>Curiosity-driven exploration maps novelty-based intrinsic rewards by mining implicit features of the environment and the agent (Stadie et al., <xref ref-type="bibr" rid="B31">2015</xref>). At the outset, Li et al. (<xref ref-type="bibr" rid="B22">2020</xref>) allocates rewards through static and dynamic encoders. Instead of focusing on individual states (Burda et al., <xref ref-type="bibr" rid="B4">2018b</xref>), the approach in the study by Yang et al. (<xref ref-type="bibr" rid="B39">2019</xref>) evaluates intrinsic rewards by extracting characteristics of changes between states. It is worth noting that the ICM framework in the study by Pathak et al. (<xref ref-type="bibr" rid="B28">2017</xref>) concurrently employs forward and inverse dynamic encoding of state features, which significantly triggers intrinsic rewards for changes. By combining previous work, Huang et al. (<xref ref-type="bibr" rid="B14">2022</xref>) proposes a unified curiosity architecture. Recent research has focused on re-evaluating the novelty of states using novel methods such as context learning (Lee et al., <xref ref-type="bibr" rid="B21">2020</xref>), contrastive learning (Huang et al., <xref ref-type="bibr" rid="B14">2022</xref>; Sun et al., <xref ref-type="bibr" rid="B32">2022a</xref>), and relevance (Grill et al., <xref ref-type="bibr" rid="B10">2020</xref>; Wu et al., <xref ref-type="bibr" rid="B37">2022</xref>). These approaches are unable to assist with robot physical states that lack redundant information. Therefore, the classical self-supervised exploration (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>; Li et al., <xref ref-type="bibr" rid="B22">2020</xref>) is employed for state feature extraction and evaluation of curiosity in this study.</p></sec></sec>
<sec id="s3">
<title>3 Problem description</title>
<p>In this section, the trajectory tracking problem is presented for an n-joint robotic manipulator. Consider a dynamic model for a robotic manipulator (Cao et al., <xref ref-type="bibr" rid="B5">2021</xref>) operating in an uncertain environment:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x000A8;</mml:mo></mml:mover><mml:mo>&#x0002B;</mml:mo><mml:mi>C</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>.</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>.</mml:mo></mml:mover><mml:mo>&#x0002B;</mml:mo><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003C4;</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>d</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M2"><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant="bold"><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:math></inline-formula> and <inline-formula><mml:math id="M3"><mml:mstyle mathvariant="bold"><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x000A8;</mml:mo></mml:mover></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denote the joint angles, velocities, and accelerations, respectively; <bold>&#x003C4;</bold>(<italic>t</italic>)&#x02208;&#x0211D;<sup><italic>n</italic></sup> denotes the joint torques; <bold><italic>d</italic></bold>(<italic>t</italic>)&#x02208;&#x0211D;<sup><italic>n</italic></sup> denotes the external disturbance force. 
<italic>M</italic>(<bold><italic>q</italic></bold>)&#x02208;&#x0211D;<sup><italic>n</italic>&#x000D7;<italic>n</italic></sup> expresses the inertial matrix; <inline-formula><mml:math id="M4"><mml:mi>C</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> represents the centrifugal-Coriolis matrix; <italic>G</italic>(<bold><italic>q</italic></bold>)&#x02208;&#x0211D;<sup><italic>n</italic></sup> is the gravity potential force, and each of them consists of known and unknown parts (Lu et al., <xref ref-type="bibr" rid="B23">2021</xref>):</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M5"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>M</mml:mi><mml:mi>&#x00394;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>C</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>&#x002D9;</mml:mo></mml:mover><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>&#x002D9;</mml:mo></mml:mover><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>&#x00394;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo>&#x002D9;</mml:mo></mml:mover><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>G</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mstyle
mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>G</mml:mi><mml:mi>&#x00394;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>q</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where (&#x000B7;)<sub>0</sub> denotes the known part and (&#x000B7;)<sub>&#x00394;</sub> denotes the unknown part, which is caused by environmental variations or measurement errors. In the actual training environment, all the dynamic parameters are unknown to the agent.</p>
<p>Define <bold><italic>x</italic><sub>1</sub></bold>(<italic>t</italic>) &#x0003D; <bold><italic>q</italic></bold>(<italic>t</italic>), <inline-formula><mml:math id="M6"><mml:mstyle mathvariant="bold"><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Substituting (2) into (1) and then rewriting it with <bold><italic>x</italic></bold> gives:</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M7"><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mo>&#x002D9;</mml:mo></mml:mover><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mn>2</mml:mn></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mo>&#x002D9;</mml:mo></mml:mover><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mi>M</mml:mi><mml:mn>0</mml:mn><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003C4;</mml:mi></mml:mstyle><mml:mo>+</mml:mo><mml:msup><mml:mi>M</mml:mi><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>d</mml:mi></mml:mstyle><mml:mo>+</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>l</mml:mi></mml:mstyle></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M8"><mml:mstyle mathvariant="bold"><mml:mi>l</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>C</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle><mml:mo>,</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover><mml:mo>-</mml:mo><mml:mi>G</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>q</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow></mml:msub><mml:mstyle mathvariant="bold"><mml:mi>&#x003C4;</mml:mi></mml:mstyle></mml:math></inline-formula> is the modeled uncertainty, which depends on the system state, where <inline-formula><mml:math id="M9"><mml:msub><mml:mrow><mml:mover
accent="true"><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>M</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula>. The uncertainty <bold><italic>l</italic></bold> and the disturbance <bold><italic>d</italic></bold> are unknown and are assumed to be bounded (Guo et al., <xref ref-type="bibr" rid="B11">2021</xref>).</p>
<p>Trajectory tracking errors can then be defined as follows:</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M10"><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>e</mml:mi></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mi>d</mml:mi></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>e</mml:mi></mml:mstyle><mml:mn>2</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mi>d</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>x</mml:mi></mml:mstyle><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x002D9;</mml:mo></mml:mover></mml:mstyle><mml:mi>d</mml:mi></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>where <bold><italic>x</italic></bold><sub><italic>d</italic></sub> is the desired trajectory, <bold><italic>e</italic></bold><sub>1</sub> means the tracking position error, and <bold><italic>e</italic></bold><sub>2</sub> indicates the tracking velocity error. <bold><italic>x</italic></bold><sub>1</sub>, <bold><italic>x</italic></bold><sub><italic>d</italic></sub>, <bold><italic>x</italic></bold><sub>2</sub>, and <bold>&#x01E8B;</bold><sub><italic>d</italic></sub> are assumed to be bounded in the control system and can be observed precisely.</p>
<p>Consider an optimal control problem with finite time horizon length <italic>N</italic>. Given an initial state <italic>s</italic><sub>0</sub>, the following minimization problem is expected to be solved (Brunke et al., <xref ref-type="bibr" rid="B2">2022</xref>):</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M11"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>J</mml:mi></mml:mstyle><mml:mrow><mml:msup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003C0;</mml:mi></mml:mstyle><mml:mo>*</mml:mo></mml:msup></mml:mrow></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mn>0</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mn>0</mml:mn><mml:mo>:</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:mi>N</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>J</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>u</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>t</mml:mi><mml:mrow><mml:mn>0</mml:mn><mml:mo>:</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:mi>N</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mi>t</mml:mi></mml:munder><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>e</mml:mi></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mstyle 
mathvariant='bold-italic' mathsize='normal'><mml:mi>u</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>s.t. <italic><bold>s</bold></italic><sub><italic>t</italic>&#x0002B;1</sub> is derived recursively from <xref ref-type="disp-formula" rid="E3">Equation 3</xref>,</p>
<disp-formula id="E30"><mml:math id="M112"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>u</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003C4;</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>d</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <bold><italic>u</italic></bold> denotes the control input and is assumed to be bounded, i.e., <inline-formula><mml:math id="M12"><mml:mstyle mathvariant="bold"><mml:mi>u</mml:mi></mml:mstyle><mml:mo>&#x02264;</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>u</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>, where <bold>&#x0016B;</bold> is a known vector; &#x003C0;<sup>&#x022C6;</sup> depicts the optimal policy. According to <xref ref-type="disp-formula" rid="E5">Equation 5</xref>, the objective is to design a controller that achieves the dynamic iteration process of <xref ref-type="disp-formula" rid="E3">Equation 3</xref> by minimizing the sum of the tracking error <bold><italic>e</italic></bold><sub>1</sub> and input cost <bold><italic>u</italic></bold>.</p></sec>
<sec id="s4">
<title>4 Preliminaries</title>
<sec>
<title>4.1 Reinforcement learning</title>
<p>As a continuous action space problem, robotic trajectory tracking control can be defined in a time-limited Markov decision process, which can be described by a quaternion set <inline-formula><mml:math id="M13"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">S</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="script">P</mml:mi></mml:mrow><mml:mo>,</mml:mo><mml:mrow><mml:mi mathvariant="script">R</mml:mi></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> (Mnih et al., <xref ref-type="bibr" rid="B26">2013</xref>; Brunke et al., <xref ref-type="bibr" rid="B2">2022</xref>; Kapturowski et al., <xref ref-type="bibr" rid="B17">2022</xref>). The state space <inline-formula><mml:math id="M14"><mml:mrow><mml:mi mathvariant="script">S</mml:mi></mml:mrow></mml:math></inline-formula> and the action space <inline-formula><mml:math id="M15"><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow></mml:math></inline-formula> are continuous, and the policy &#x003C0; provides a probability of transition from current state <inline-formula><mml:math id="M16"><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">S</mml:mi></mml:mrow></mml:math></inline-formula> with current action <inline-formula><mml:math id="M17"><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow></mml:math></inline-formula> to next state <inline-formula><mml:math id="M18"><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi 
mathvariant="script">S</mml:mi></mml:mrow></mml:math></inline-formula>: <italic>p</italic>(<italic>s</italic><sub><italic>t</italic>&#x0002B;1</sub>|<italic>a</italic><sub><italic>t</italic></sub>, <italic>s</italic><sub><italic>t</italic></sub>)= &#x003C0;(<italic>a</italic><sub><italic>t</italic></sub>|<italic>s</italic><sub><italic>t</italic></sub>) <inline-formula><mml:math id="M19"><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">P</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. The emits result <inline-formula><mml:math id="M20"><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mi mathvariant="script">R</mml:mi></mml:mrow></mml:math></inline-formula> means the reward of each transit step, the sum of which denotes the reward of episodes. Our goal is to train a policy &#x003C0; to obtain the most expected rewards from every episode, which can be defined as follows:</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022C6;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003C0;<sup>&#x022C6;</sup> denotes the optimal policy.</p></sec>
<sec>
<title>4.2 Soft actor critic</title>
<p>Soft Actor Critic (SAC) (Haarnoja et al., <xref ref-type="bibr" rid="B12">2019</xref>) is an off-policy method based on the actor-critic algorithm. This approach uses the idea of maximum entropy to enhance the policy&#x00027;s ability to explore:</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x022C6;</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mo>&#x02211;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C1;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mi mathvariant="script">H</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mstyle 
mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003C1;<sub>&#x003C0;</sub> denotes the state-action distribution induced by the policy &#x003C0; in the Markov decision process. The actor network outputs the action policy and updates its parameters &#x003B8;<sup><italic>a</italic></sup> using <xref ref-type="disp-formula" rid="E8">Equation 8</xref>.</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C1;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>&#x003B1;</mml:mi><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>Q</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>Q</italic>(<bold>s</bold><sub><italic>t</italic></sub>, <bold>a</bold><sub><italic>t</italic></sub>) denotes the critic network, which adopts the <xref ref-type="disp-formula" rid="E9">Equation 9</xref> to update the parameter &#x003B8;<sup><italic>Q</italic></sup>.</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M24"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mrow><mml:msup><mml:mi>&#x003B8;</mml:mi><mml:mi>Q</mml:mi></mml:msup></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>~</mml:mo><mml:msub><mml:mi>&#x003C1;</mml:mi><mml:mi>&#x003C0;</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy='false'>[</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>Q</mml:mi><mml:mrow><mml:msup><mml:mi>&#x003B8;</mml:mi><mml:mi>Q</mml:mi></mml:msup></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mi>r</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:mi>&#x003B3;</mml:mi><mml:msub><mml:mi 
mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>~</mml:mo><mml:msub><mml:mi>&#x003C1;</mml:mi><mml:mi>&#x003C0;</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo stretchy='false'>[</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mrow><mml:msup><mml:mi>&#x003B8;</mml:mi><mml:mover accent='true'><mml:mi>Q</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover></mml:msup></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>]</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mn>2</mml:mn></mml:msup><mml:mo stretchy='false'>]</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In practice, the target critic network <inline-formula><mml:math id="M26"><mml:mover accent="true"><mml:mrow><mml:mi>Q</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is used to approximate value network <italic>V</italic>(<bold>s</bold><sub><italic>t</italic>&#x0002B;1</sub>), which avoids overestimating the state value. Finally, the maximum entropy adaptive exploration of <xref ref-type="disp-formula" rid="E7">Equation 7</xref> is achieved by the adaptive temperature coefficient &#x003B1;.</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C1;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mo class="qopname">log</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C0;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>a</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02223;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mrow><mml:mi mathvariant="script">H</mml:mi></mml:mrow></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M28"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:mrow><mml:mi mathvariant="script">H</mml:mi></mml:mrow></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> is a lower bound on the entropy expectation. Once the network parameters have been updated, all target networks are soft updated.</p></sec>
<sec>
<title>4.3 Dyna-Q-like MBRL</title>
<p>To simulate the real world, consider a dynamic model <inline-formula><mml:math id="M29"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">S</mml:mi></mml:mrow><mml:mo>|</mml:mo><mml:mo>&#x0002B;</mml:mo><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">A</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup><mml:mo>&#x021A6;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:mi mathvariant="script">S</mml:mi></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula>. For continuous states and actions, a probability distribution sampling method is proposed so that the world model can be output as a probabilistic form as follows:</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M30"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>P</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The learning objective of <inline-formula><mml:math id="M31"><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula> is to fit the real-world model <italic>f</italic><sup>&#x022C6;</sup> and give an unbiased output <italic>s</italic><sub><italic>t</italic>&#x0002B;1</sub>, which is trained by using the dataset <inline-formula><mml:math id="M32"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of length <italic>N</italic>, collected from the environment (Chua et al., <xref ref-type="bibr" rid="B7">2018</xref>; Kurutach et al., <xref ref-type="bibr" rid="B18">2018</xref>).</p></sec>
<sec>
<title>4.4 Controller saturation</title>
<p>To ensure accurate trajectory tracking, it is essential to provide sufficient input to assist with the robotic manipulator&#x00027;s pose transitions. However, saturated inputs can occasionally result in actual inputs being smaller than desired values, leading to poor tracking performance. Consequently, devising a controller that prevents such occurrences becomes an important issue. Suppose the joint torque <bold>&#x003C4;</bold> is the only control input, then the input saturation can be described as follows:</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M33"><mml:mrow><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;if&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x02265;</mml:mo><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;if&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>min</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0003C;</mml:mo><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x0003C;</mml:mo><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>min</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd><mml:mtd 
columnalign='left'><mml:mrow><mml:mo>,</mml:mo><mml:mtext>&#x000A0;if&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x02264;</mml:mo><mml:msub><mml:mi>&#x003C4;</mml:mi><mml:mrow><mml:mi>min</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>...</mml:mn><mml:mo>,</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:math></disp-formula>
<p>where &#x003C4;<sub>max</sub> is the maximum of torque and &#x003C4;<sub>min</sub> is the minimum of torque.</p></sec></sec>
<sec id="s5">
<title>5 CMPO framework design</title>
<sec>
<title>5.1 Architecture summary</title>
<p>The CMPO framework contains an environment, a world model, an agent, and reply buffers. The control gain derived by the agent is input to the environment. The robotic manipulator will solve the dynamics based on the input and eventually output error and state information from the environment, which will be stored in the environment buffer. Next, the environment buffer data are used to train both the world model and the curiosity network. Before training the world model, the environment buffer data are divided into training and evaluation datasets. After training the world model and the curiosity network, the world model generates simulation data and stores it in the world model buffer. Finally, the data from the environment buffer and model buffer are uniformly collected by the buffer scheduler, which is used to train the agent&#x00027;s actor network and Q network. The agent will continuously follow this loop to interact with the environment and train until convergence, and more detailed frameworks are shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. Specific implementation details are shown in Algorithm 1.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Schematic of the CMPO architecture. The agent interacts with the environment to store environment buffer data. The world model selects particles in the ensemble network to store generated model buffer data. In addition, the environment buffer data are used to train the curiosity network. Ultimately, the agent&#x00027;s actor network and Q network are trained using the buffer data evaluated via curiosity.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0001.tif"/>
</fig></sec>
<sec>
<title>5.2 MBRL design</title>
<p>The MBPO technique supplements the branch prediction and ensemble model based on the Dyna-Q-like MBRL method, which increases the model sampling efficiency and training speed (Janner et al., <xref ref-type="bibr" rid="B16">2021</xref>). Therefore, MBPO is used to design the world model under the CMPO algorithm. To better express and generalize the complex dynamic environment in a continuous Markov process, the Gaussian probabilistic neural network models are used to fit the environment to cope with the aleatoric uncertainty (Chua et al., <xref ref-type="bibr" rid="B7">2018</xref>). Building on <xref ref-type="disp-formula" rid="E11">Equation 11</xref>, the world model will predict the next state and reward. Thus, the model can be rewritten as follows:</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M34"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mover accent='true'><mml:mi>f</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>s</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>r</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mtext>P</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>s</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>r</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>;</mml:mo><mml:mi>&#x003B8;</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>=</mml:mo><mml:mi 
mathvariant='script'>N</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x003A3;</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M35"><mml:mrow><mml:mi mathvariant="script">N</mml:mi></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x000B7;</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> denotes the Gaussian distribution.</p>
<p>Since the states in the actual environment are less dimensional, the feature network in curiosity is used along with the model to encode its states, augmenting the feature extraction capability. Given that the environment changes slightly from step to step, the deterministic network (Luo et al., <xref ref-type="bibr" rid="B24">2022</xref>) is applied to describe the predicted output of the world model. With the combination of the above changes, <xref ref-type="disp-formula" rid="E13">Equation 13</xref> can be rewritten to satisfy the iterative output of the next state in <xref ref-type="disp-formula" rid="E5">Equation 5</xref>:</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M36"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>s</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>r</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>f</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x00394;</mml:mi><mml:msub><mml:mover accent='true'><mml:mi>s</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>&#x00394;</mml:mi><mml:msub><mml:mover accent='true'><mml:mi>r</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo 
stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi mathvariant='script'>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C6;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x003A3;</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x003C6;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003C6;(&#x000B7;) denotes the feature network and &#x00394;(&#x000B7;) denotes the magnitude of change.</p>
<p>Using the basis of <xref ref-type="disp-formula" rid="E14">Equation 14</xref> as the particle, the B-length bootstrap ensemble model <inline-formula><mml:math id="M37"><mml:mstyle mathvariant="bold"><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x022EF;</mml:mo><mml:mspace width="0.3em" class="thinspace"/><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> is adopted as the final world model, and then, a sum of negative log-likelihood loss is used as follows (Chua et al., <xref ref-type="bibr" rid="B7">2018</xref>) :</p>
<disp-formula id="E15"><label>(15)</label><mml:math id="M38"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mi>&#x02112;</mml:mi><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mtext>T</mml:mtext></mml:msup></mml:mrow></mml:mstyle><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003A3;</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' 
mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mi>&#x003BC;</mml:mi><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>log</mml:mi><mml:mi>det</mml:mi><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003A3;</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle></mml:msub><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mstyle 
mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In practice, the output is obtained randomly from a particle by designing short trajectory sampling frequencies (TS-1 method; Chua et al., <xref ref-type="bibr" rid="B7">2018</xref>). At every environmental timestep, the TS-1 method selects a new particle <inline-formula><mml:math id="M39"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> from the ensemble model <inline-formula><mml:math id="M40"><mml:mstyle mathvariant="bold"><mml:mover accent="true"><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mstyle></mml:math></inline-formula> to serve as the branching prediction output for the next timestep. Branch rollouts are used to recursively generate new data from the world model by means of <xref ref-type="disp-formula" rid="E14">Equation 14</xref> and store it in the world model buffer <inline-formula><mml:math id="M41"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. Theory (Shen et al., <xref ref-type="bibr" rid="B30">2020</xref>) suggests that incremental branch lengths can ensure advancements in real training, and the model returns are increased enough to guarantee the progression of base returns. 
For the agent, the SAC (Haarnoja et al., <xref ref-type="bibr" rid="B12">2019</xref>) algorithm is used to optimize the policy with data collected from mixed <inline-formula><mml:math id="M42"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>e</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M43"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>.</p></sec>
<sec>
<title>5.3 Curiosity model design</title>
<sec>
<title>5.3.1 Self-supervised exploration</title>
<p>The curiosity network provides the agent with additional intrinsic rewards to overcome the uncertainty of the environment through more exploration, whereas the networks look for potential patterns in the environment through self-supervised learning. The self-supervised exploration of curiosity is inspired by the earlier structure (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>), which consists of a forward network and an inverse network. Due to the simplicity of the robot information, the states are encoded by the same feature networks before being fed into them.</p>
<p>The inverse network takes in the current and next state as inputs and outputs the current action. This allows the inverse network to learn how to derive the correct control gain. The loss function of the inverse network can be expressed as the mean squared error between the predicted and actual action (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>):</p>
<disp-formula id="E16"><label>(16)</label><mml:math id="M44"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Inverse</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>B</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>a</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>a</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>B</italic> is the batch size of each train step. The main task of the inverse network is to learn potential feature encodings so that they provide more feature semantics in the inputs for both the world model and the forward network.</p>
<p>The input of the forward network is identical to the world model, which uses a residual network (Li et al., <xref ref-type="bibr" rid="B22">2020</xref>) to predict the feature encoding of the next state <inline-formula><mml:math id="M45"><mml:mover accent="true"><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold"><mml:mi>s</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. The disparity between the predicted and actual encodings is used to measure curiosity and is defined as the loss function of the forward network:</p>
<disp-formula id="E17"><label>(17)</label><mml:math id="M46"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Forward</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mo>||</mml:mo><mml:mover accent="true"><mml:mrow><mml:mi>&#x003C6;</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>s</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>s</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>r</italic><sup><italic>o</italic></sup> is the output intrinsic reward, as measured by coded differences. Combining <xref ref-type="disp-formula" rid="E16">Equations 16</xref>, <xref ref-type="disp-formula" rid="E17">17</xref>, as mentioned in the study by Pathak et al. (<xref ref-type="bibr" rid="B28">2017</xref>):</p>
<disp-formula id="E18"><label>(18)</label><mml:math id="M47"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Curiosity</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Forward</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">Inverse</mml:mtext></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where 1&#x02265;&#x003B2;&#x02265;0 is a scalar to balance <inline-formula><mml:math id="M48"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">Forward</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M49"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">Inverse</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>. Curiosity encourages the agent to look for new states (Li et al., <xref ref-type="bibr" rid="B22">2020</xref>), which improves the agent&#x00027;s sampling efficiency. However, in uncertain environments, pessimistic incentives can lead robots to undertake risky actions. Hence, providing a method to evaluate intrinsic rewards plays a crucial role. The next part of this study provides a valuation and conversion strategy for intrinsic rewards.</p></sec>
<sec>
<title>5.3.2 Positive&#x02013;negative intrinsic</title>
<p>To further upgrade the sampling efficiency in uncertain environments with input saturation, an approach is proposed to strengthen the expression of curiosity, which is distinguished as positive and negative.</p>
<p>Let <italic>X</italic> denote the sample of quaternion corresponding to the Markov process, and its two subscripts (&#x000B7;)<sub><italic>e</italic></sub> and (&#x000B7;)<sub><italic>m</italic></sub> denote the samples of the quaternion in <inline-formula><mml:math id="M50"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M51"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>, respectively, then <italic>F</italic> denotes the distribution function of these two datasets. Assume that the trained world model is plausible according to the theory by Janner et al. (<xref ref-type="bibr" rid="B16">2021</xref>). Given the inputs in <inline-formula><mml:math id="M52"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>, the model output distribution will also follow the distribution pattern in <inline-formula><mml:math id="M53"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>. 
It can be further deduced that the generated dataset <inline-formula><mml:math id="M54"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> should exhibit similarity to the distribution of <inline-formula><mml:math id="M55"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> as follows:</p>
<disp-formula id="E19"><label>(19)</label><mml:math id="M56"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02248;</mml:mo><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>After training in <inline-formula><mml:math id="M57"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>, data input in <inline-formula><mml:math id="M58"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> will produce small intrinsic rewards based on the convergence law of curiosity. Concerning <inline-formula><mml:math id="M59"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M60"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>, it is known from <xref ref-type="disp-formula" rid="E19">Equation 19</xref> that curiosity still produces little reward when the world model produces the same distribution of data inputs. Based on the above, we venture to use the current world model as a baseline for measuring curiosity.</p>
<p><inline-formula><mml:math id="M61"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> becomes <inline-formula><mml:math id="M62"><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> after collecting new data and no longer satisfies the <xref ref-type="disp-formula" rid="E19">Equation 19</xref>. Assuming that the distribution of <inline-formula><mml:math id="M63"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> follows the principle of being nearest and most similar to that of <inline-formula><mml:math id="M64"><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>. The predictive rewards of the world model are designed as a baseline so that the curiosity of <inline-formula><mml:math id="M65"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> is defined as positive. 
<inline-formula><mml:math id="M66"><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> compares actual and predicted reward differences to assess whether curiosity is positive or negative. Taken together, the output positive&#x02013;negative intrinsic reward can be rewritten as follows:</p>
<disp-formula id="E20"><label>(20)</label><mml:math id="M67"><mml:mrow><mml:msup><mml:mi>r</mml:mi><mml:mi>o</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mi>sgn</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>r</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>&#x02016;</mml:mo><mml:mrow><mml:mover accent='true'><mml:mi>&#x003C6;</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x02016;</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mtext>if&#x000A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msubsup><mml:mi 
mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env&#x000A0;</mml:mtext></mml:mrow><mml:mo>&#x02032;</mml:mo></mml:msubsup></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mo>+</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>&#x02016;</mml:mo><mml:mrow><mml:mover accent='true'><mml:mi>&#x003C6;</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003C6;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>&#x02016;</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mtext>if&#x000A0;</mml:mtext><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>model&#x000A0;</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where sgn(&#x000B7;) denotes the sign function. In practice, the sign is used instead of the difference to account for the mismatch between the model and the environmental data (Shen et al., <xref ref-type="bibr" rid="B30">2020</xref>). Empirical evidence reveals that the sign is adequately robust to information bias. The specific flow of the algorithm is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. Intrinsic rewards serve as appraisals of the agent&#x00027;s exploration process and, together with extrinsic rewards for environmental interactions, constitute rewards for the actual output. However, a poorly chosen proportion of intrinsic rewards in an intensive reward environment can lead to the nullification of extrinsic rewards. In the next section, a method for adjusting the amount of intrinsic and extrinsic rewards is developed.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Flowchart of the algorithm for the curiosity model. The world model outputs predicted current rewards and predicted next states after inputting feature codes and actions for the current state. Meanwhile, the forward and inverse networks output the predicted current state features and current actions, respectively. The sign of the difference between the predicted reward and the current reward is referred to as the assessment of curiosity. The predicted features along with the actual features computed as intrinsic rewards are finally used as current intrinsic rewards along with the evaluation results.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0002.tif"/>
</fig>
<p>Remark 1. Curiosity is sensitive to state changes due to its state novelty design (Burda et al., <xref ref-type="bibr" rid="B3">2018a</xref>). Research (Brunke et al., <xref ref-type="bibr" rid="B2">2022</xref>) suggests that curiosity can have a negative effect when bad states are received. <xref ref-type="disp-formula" rid="E20">Equation 20</xref> dynamically adjusts intrinsic rewards based on the quality of the state. When the environmental state is unfavorable, pessimistic curiosity hinders the exploration of the agent in that direction. Compared with the single method of evaluating state differences in the study by Pathak et al. (<xref ref-type="bibr" rid="B28">2017</xref>) and Burda et al. (<xref ref-type="bibr" rid="B4">2018b</xref>), positive&#x02013;negative intrinsic rewards help the agent explore in a relatively better direction, which improves the model&#x00027;s sampling efficiency.</p>
<p>Remark 2. In the statement above, the nearest-similarity principle refers to a reasonable assumption that the input data generated by the next episode are also available in <inline-formula><mml:math id="M68"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> and have been used to train the agent recently. The reasonableness of the assumption is based on the phenomenon that each agent training samples a much higher proportion of model data than environment data. Moreover, the world model extrapolates predictions from the initial states within <inline-formula><mml:math id="M69"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>, underlining the similarity between the distribution of <inline-formula><mml:math id="M70"><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M71"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">model</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula>.</p></sec>
<sec>
<title>5.3.3 Curiosity expansion</title>
<p>The ICM (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>) moderates the impact of agent curiosity exploration using a fixed intrinsic reward ratio and was subsequently followed by the studies of Burda et al. (<xref ref-type="bibr" rid="B4">2018b</xref>) and Yang et al. (<xref ref-type="bibr" rid="B39">2019</xref>). Nonetheless, the intrinsic reward decreases significantly with agent updates. It is difficult to tune an appropriate ratio for the intrinsic reward. Instead of utilizing a fixed ratio, the acquisition of intrinsic rewards is treated as a constrained problem, where the mean value of intrinsic rewards is constrained, allowing intrinsic rewards to change adaptively during training. A similar approach is mentioned in the study by Boyd and Vandenberghe (<xref ref-type="bibr" rid="B1">2004</xref>) and Haarnoja et al. (<xref ref-type="bibr" rid="B12">2019</xref>), where it is applied to adaptively constrain the temperature coefficient in maximum entropy optimization. However, the curiosity-agent complexity association makes the optimization problematic.</p>
<p>In <xref ref-type="disp-formula" rid="E19">Equation 19</xref>, the relationship between the model data and the environment data has been mentioned, which shows that the world model boosts the agent. Thus, the maximum return of rewards from the agent in the intrinsic reward ratio constraint problem is equivalent to solving for the optimal fit of the curiosity and world model to the environment. Formally, the following constrained optimization problem is concerned:</p>
<disp-formula id="E21"><label>(21)</label><mml:math id="M72"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mover accent='true'><mml:mi>f</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover></mml:mstyle><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mrow><mml:mn>0</mml:mn><mml:mtext>&#x02009;</mml:mtext></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mtext>&#x02009;</mml:mtext><mml:msub><mml:mi>b</mml:mi><mml:mi>T</mml:mi></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>~</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mover accent='true'><mml:mi>&#x003C6;</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mi>&#x003C6;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' 
mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>+</mml:mo><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>~</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mover accent='true'><mml:mi>f</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle><mml:mo>&#x022C6;</mml:mo></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo 
stretchy='false'>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;s.t.</mml:mtext><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>~</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msup><mml:mi>r</mml:mi><mml:mi>o</mml:mi></mml:msup><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x02265;</mml:mo><mml:mover accent='true'><mml:mtext>R</mml:mtext><mml:mo stretchy='true'>&#x000AF;</mml:mo></mml:mover></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M73"><mml:mover accent="true"><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> is the lower bound of the target intrinsic reward and <italic>b</italic><sub>(&#x000B7;)</sub> denotes the training batch size index. There is no need to impose a constrained upper bound, as the output intrinsic reward decreases and converges during world model training. Similar to the method of Haarnoja et al. (<xref ref-type="bibr" rid="B12">2019</xref>), an iterative scheme is used to optimize from the last batch, modifying the constrained optimization of <xref ref-type="disp-formula" rid="E21">Equation 21</xref> to minimize its dual problem as follows:</p>
<disp-formula id="E22"><label>(22)</label><mml:math id="M74"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:munder><mml:mrow><mml:mi>max</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mi>&#x003B7;</mml:mi><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>T</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mo>&#x02265;</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:munder><mml:munder><mml:mrow><mml:mi>min</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x002DC;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>T</mml:mi></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>~</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env&#x000A0;</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x002DC;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' 
mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle><mml:mo>&#x022C6;</mml:mo></mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;&#x02009;</mml:mtext><mml:mo>+</mml:mo><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:msub><mml:mi mathvariant='double-struck'>E</mml:mi><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>~</mml:mo><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo 
stretchy='false'>[</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msub><mml:mi>&#x003B7;</mml:mi><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>T</mml:mi></mml:msub></mml:mrow></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mi>r</mml:mi><mml:mi>o</mml:mi></mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo stretchy='false'>]</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>&#x003B7;</mml:mi><mml:mrow><mml:msub><mml:mi>b</mml:mi><mml:mi>T</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mover accent='true'><mml:mtext>R</mml:mtext><mml:mo stretchy='true'>&#x000AF;</mml:mo></mml:mover></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003B7;<sub><italic>b</italic><sub><italic>T</italic></sub></sub> is the dual variable. The variable &#x003B7;<sub><italic>b</italic><sub><italic>T</italic></sub></sub> to be optimized in <xref ref-type="disp-formula" rid="E22">Equation 22</xref> corresponds to the ratio variable when the world model is trained to fit the environment. The optimal dual variables are addressed as follows:</p>
<disp-formula id="E23"><label>(23)</label><mml:math id="M75"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo>&#x022C6;</mml:mo></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow></mml:munder></mml:mstyle></mml:mtd><mml:mtd><mml:msub><mml:mrow><mml:mi>&#x1D53C;</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>s</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>s</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>a</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0007E;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>a</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mover accent="true"><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>An iterative approach is adopted to batch optimization for <inline-formula><mml:math id="M76"><mml:msubsup><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo>&#x022C6;</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula>, since approximate optimization using neural networks is still valid. <xref ref-type="disp-formula" rid="E23">Equation 23</xref> is rewritten as the optimized minimum loss function to facilitate consistent formatting:</p>
<disp-formula id="E24"><label>(24)</label><mml:math id="M77"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B7;</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mover accent="true"><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The ratio optimized by <inline-formula><mml:math id="M78"><mml:msubsup><mml:mrow><mml:mi>&#x003B7;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mo>&#x022C6;</mml:mo></mml:mrow></mml:msubsup></mml:math></inline-formula> may be lower than the manually designed one at the beginning. Nevertheless, as the adaptive competition between curiosity and the ratio converges, this method will still vary in the later steps, providing a boost for the agent. The detailed variation process is shown in <xref ref-type="fig" rid="F7">Figure 7D</xref>.</p>
<p>After using the adaptive ratio variable, intrinsic reward <italic>r</italic><sup><italic>i</italic></sup> and total reward <italic>r</italic> are defined as follows:</p>
<disp-formula id="E25"><label>(25)</label><mml:math id="M79"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003B7;</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>r</italic><sup><italic>e</italic></sup> is the extrinsic reward that can be obtained from the environment. Substituting this new reward <italic>r</italic>, the critic network update in <xref ref-type="disp-formula" rid="E9">Equation 9</xref> of SAC is improved. In addition, the critic network also allows for better evaluation and training of the actor network, increasing the overall training speed of the agent.</p>
<p>Remark 3. Differing from the previous intrinsic reward design (Pathak et al., <xref ref-type="bibr" rid="B28">2017</xref>), <italic>r</italic><sup><italic>i</italic></sup> in <xref ref-type="disp-formula" rid="E25">Equation 25</xref> has a lower bound <inline-formula><mml:math id="M80"><mml:mover accent="true"><mml:mrow><mml:mtext>R</mml:mtext></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula>. With the gradient optimization method, the ratio of intrinsic rewards is updated along with the curiosity network to ensure a pessimistic lower bound on intrinsic rewards. Furthermore, <italic>r</italic><sup><italic>i</italic></sup> converges toward reduction as indicated by the <xref ref-type="disp-formula" rid="E17">Equation 17</xref>. Through the interaction of decreasing convergence and expansive updates, intrinsic rewards remain influential despite the later stages of agent training. Subsequent experiments demonstrated the ability of the ratios to have a sustained impact, as shown in <xref ref-type="fig" rid="F7">Figure 7D</xref>.</p></sec>
<sec>
<title>5.3.4 Adaptive buffer schedule</title>
<p>Previous model optimization theories (Janner et al., <xref ref-type="bibr" rid="B16">2021</xref>; Luo et al., <xref ref-type="bibr" rid="B25">2021</xref>) provide a reliable basis for branch length enhancements, but they lack a method for selecting the buffer ratio. The buffer ratio helps the agent to sample a quota of environment data and model data, which are used for agent training. The environmental data are accurate, but the total amount of it is much less than the model data, which is not enough to train the agent. Conversely, the model data are sufficient, yet the insufficiently trained world model generates fatally biased data, leading to difficulties in convergence of agent training. The recent FVI bounds theory (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>) provides the missing ratio theory, showing that a dynamically increasing ratio of the environment buffer is beneficial for improving the agent. Inspired by this remark, the specific method for judging the ratio is provided under the curiosity model.</p>
<table-wrap position="float" id="T1">
<label>Algorithm 1</label>
<caption><p>CMPO training algorithm.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-i0001.tif"/>
</table-wrap>
<p>The curiosity that converged for <inline-formula><mml:math id="M101"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> before training may be different for <inline-formula><mml:math id="M102"><mml:msubsup><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and thus, novelty ratio &#x003BE; is defined as follows:</p><disp-formula id="E26"><label>(26)</label><mml:math id="M103"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mi>&#x003BE;</mml:mi><mml:mo>=</mml:mo><mml:mi>clip</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>exp</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mo stretchy='false'>(</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>env&#x000A0;</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msup><mml:mi>r</mml:mi><mml:mi>o</mml:mi></mml:msup><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' 
mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x02009;&#x02009;</mml:mtext><mml:mo>&#x02212;</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant='script'>D</mml:mi><mml:mrow><mml:mtext>model&#x000A0;</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:msup><mml:mi>r</mml:mi><mml:mi>o</mml:mi></mml:msup></mml:mrow></mml:mstyle><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>s</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x0005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>a</mml:mi></mml:mstyle><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mn>2</mml:mn></mml:msup><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x003BE;</mml:mi><mml:mrow><mml:mi>min</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>&#x003BE;</mml:mi><mml:mrow><mml:mi>max</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where clip(&#x000B7;) denotes the clip function; its first parameter is the raw value, which will be clipped to the lower bound value &#x003BE;<sub>min</sub> and the upper bound value &#x003BE;<sub>max</sub>. The ratio &#x003BE; is defined as the proportion of data sampled from <inline-formula><mml:math id="M104"><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">D</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">env</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:math></inline-formula> in each agent training step.</p>
<p>Remark 4. Effectively organizing environment and model buffer data is a challenging endeavor (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>). Although the relevant theory (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>) proves the importance of scheduling and provides a method for calculating the ratio, it uses an additional agent implementation that requires additional training, resulting in high implementation costs. The novelty distance uses known world models and buffer data for calculation, without requiring additional computational costs. In addition, the corresponding experimental ratio changes exhibit similarity to the theory, as shown in <xref ref-type="fig" rid="F7">Figure 7C</xref>.</p>
<p>Remark 5. At the beginning of the training iterations, the agent will continue to explore novel data, and the difference in <xref ref-type="disp-formula" rid="E26">Equation 26</xref> will be amplified, hence the ratio will be at its minimum value. As the number of iterations increases and the curiosity model learns more data, the agent gradually encounters less novel data, and accordingly the difference in <xref ref-type="disp-formula" rid="E26">Equation 26</xref> is scaled down so that the ratio gradually increases to the maximum value. More detailed trends are shown in <xref ref-type="fig" rid="F7">Figure 7C</xref>. This ratio, which reduces the bias of the world model toward the agent, is in line with this theory&#x00027;s main thrust.</p></sec></sec></sec>
<sec id="s6">
<title>6 Controller design</title>
<p>The curiosity model can help the agent to solve complex dynamic problems, but in practice, further assurance is essential that the agent will explore the robotic manipulator safely (Brunke et al., <xref ref-type="bibr" rid="B2">2022</xref>), which happens to be the strength of traditional controllers. PID is a simple model-free controller that can accomplish trajectory tracking tasks given suitable parameters (Wang et al., <xref ref-type="bibr" rid="B35">2020</xref>). In this section, the CMPO is combined with PID controllers to provide suitable control gains so that the controllers achieve satisfactory performance even in uncertain environments with input saturation.</p>
<sec>
<title>6.1 Reward design</title>
<p>According to the definition of the dynamic equations shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>, it is known that updating the system is related to position and velocity (Hu et al., <xref ref-type="bibr" rid="B13">2020</xref>). Hence, non-linear rewards are designed for the positional factors, while the auxiliary speed factors use linear rewards, which are designed as follows:</p>
<disp-formula id="E27"><label>(27)</label><mml:math id="M105"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003C2;</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003C3;</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msubsup><mml:mi>A</mml:mi><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <bold>&#x003C3;</bold> denotes the benefit threshold, &#x003C2;&#x0003E;0 denotes the sensitive scale, and <italic>A</italic> is a semi-positive definite constant matrix that denotes the weight of velocity in extrinsic reward. &#x003C3; is used to indicate the limit of positive and negative rewards, allowing the agent&#x00027;s capability to be as good as possible for that bound. To allow the agent to have a fast exploratory ramp-up period in rewards, &#x003C2; can set the ratio of increase so that the agent can obtain rewards quickly after a certain level of performance is achieved.</p></sec>
<sec>
<title>6.2 Tracking controller design</title>
<p>The general definition of PID controller is as follows (Xu et al., <xref ref-type="bibr" rid="B38">2019</xref>):</p>
<disp-formula id="E28"><label>(28)</label><mml:math id="M106"><mml:mrow><mml:mi>&#x003C4;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>p</mml:mi></mml:msub><mml:mi>e</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>+</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mstyle displaystyle='true'><mml:mrow><mml:msubsup><mml:mo>&#x0222B;</mml:mo><mml:mn>0</mml:mn><mml:mi>T</mml:mi></mml:msubsup><mml:mi>e</mml:mi></mml:mrow></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mi>d</mml:mi><mml:mi>t</mml:mi><mml:mo>+</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mfrac><mml:mi>d</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mi>e</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<p>Discretizing <xref ref-type="disp-formula" rid="E28">Equation 28</xref> and applying it to the robotic manipulator environment, it can be rewritten as follows:</p>
<disp-formula id="E29"><label>(29)</label><mml:math id="M107"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-italic"><mml:mi>&#x003C4;</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo></mml:mtd><mml:mtd><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B4;</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mi>e</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>K</italic><sub><italic>p</italic></sub>, <italic>K</italic><sub><italic>i</italic></sub>, and <inline-formula><mml:math id="M108"><mml:msub><mml:mrow><mml:mi>K</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> denote the proportional, integral, and differential gains of the n-joint dimension, respectively, and the integral term is approximated by proportional smoothing, which has a proportional value &#x003B4; that denotes the memory of past errors. <xref ref-type="disp-formula" rid="E29">Equation 29</xref> is used as a traditional control, which becomes the link between the action inputs and the conversion of the inputs from the robotic manipulator system.</p>
<p>The <bold>&#x003C4;</bold> and <bold><italic>d</italic></bold> generated by the environment are iterated according to <xref ref-type="disp-formula" rid="E5">Equation 5</xref>, and new environment data are generated as a means of cyclic execution in the environment. In the CMPO framework, the environment steps are completed, and data are collected by interacting with the environment, inputting the current state <italic>s</italic><sub><italic>t</italic></sub> &#x0003D; {<bold><italic>x</italic></bold>, <bold><italic>x</italic></bold><sub><italic>d</italic></sub>, <bold>&#x01E8B;</bold>, <bold>&#x01E8B;</bold><sub><italic>d</italic></sub>} to the agent and obtaining the action <italic>a</italic><sub><italic>t</italic></sub> &#x0003D; {<italic>K</italic><sub><italic>p</italic></sub>, <italic>K</italic><sub><italic>i</italic></sub>, <italic>K</italic><sub><italic>d</italic></sub>} as controller gain input to the traditional controller.</p>
<p>After the PID has obtained the controller gain, the torque is calculated based on the error input and serves as the force input at each joint of the robotic manipulator. The robot manipulator calculates the position for the next step, which is used to determine the next state and the new error. Finally, the reward system gives an evaluation and updates the critic network and actor network sequentially. Figure 3 contains specific details regarding the environmental control and updating cycles.</p>
<p>More details of the parameter update are shown in <xref ref-type="fig" rid="F1">Figures 1</xref>, <xref ref-type="fig" rid="F3">3</xref>. Once the environment and model buffers have accumulated enough data, the agent will be updated in steps. Similarly, the world model is updated in episodes with all the data from the environment buffer. These two processes make up the update cycle of the controller. The pseudocode for the CMPO controller is shown in Algorithm 1.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Schematic diagram of how robotic manipulator control works with reinforcement learning combined with a conventional controller. The PID controller calculates the joint torque based on the control gain of the actor network. Next, the robotic manipulator receives the torque and obtains the next state information. This information is used to calculate the next control gain and the error from the actual trajectory position. The input error is then utilized by the reward system to update the critic network, which, in turn, updates the actor network.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0003.tif"/>
</fig></sec></sec>
<sec id="s7">
<title>7 Experimental results and analysis</title>
<sec>
<title>7.1 Environmental configurations</title>
<p>Based on the scheme and methodology shown in the study by Hu et al. (<xref ref-type="bibr" rid="B13">2020</xref>), a simulation environment is set up for the tracking control of a two-link (2-DOF) manipulator in an uncertain environment with robot input saturation. The format of the parameters, inertial matrix, centrifugal and Coriolis force matrix, gravitational force effect of the robot, and their internal specific parameters are shown in <xref ref-type="supplementary-material" rid="SM1">Appendix A</xref>.</p>
<p>The control performance experiments of CMPO are compared with cutting-edge controllers. Advanced controllers not only use robotic models to improve control accuracy but also counteract environmental uncertainties through a sliding mode robust approach (Islam and Liu, <xref ref-type="bibr" rid="B15">2011</xref>; Chertopolokhov et al., <xref ref-type="bibr" rid="B6">2023</xref>). Unlike model-free controllers, model-based controller performance relies on the accuracy of the robotic model. In uncertain environments, robotic arm models may contain errors. Thus, different robotic model errors are employed in advanced controllers to compare the control performance with the CMPO algorithm.</p>
<p>In the environment, the individual states of the parameters of the robotic manipulator will be initially set as <italic>q</italic><sub>1</sub>(0) &#x0003D; <italic>q</italic><sub>2</sub>(0) &#x0003D; &#x02212;0.5 and <inline-formula><mml:math id="M109"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>. 
The curves required to be tracked are designed as <italic>q</italic><sub><italic>d</italic>1</sub>(<italic>t</italic>) &#x0003D; sin(<italic>t</italic>) and <italic>q</italic><sub><italic>d</italic>2</sub>(<italic>t</italic>) &#x0003D; cos(<italic>t</italic>), thereby the tracking velocity is designed as <inline-formula><mml:math id="M110"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo class="qopname">cos</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <inline-formula><mml:math id="M111"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mo>&#x02022;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mo class="qopname">sin</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Then, the extrinsic reward is set as &#x003C3; &#x0003D; 0.35, &#x003C2; &#x0003D; 2.0, and <italic>A</italic> &#x0003D; <bold>0</bold>, and the parameter of the PID controller is set as &#x003B4; &#x0003D; 0.5. The step size is limited to 5, 000 for each episode of the environment, and the time variance of each step is limited to 0.01 s. 
Each episode of the environment simulates real-time information about the robot&#x00027;s trajectory for 50 s in agent training and the same step gap for 30 s in checkpoint simulation.</p>
<p>Depending on the experiment, the environments are categorized into three types, which are: basic, small-change, and big-change environments. Each of the three settings adds saturation and disturbance, which are set as <bold>&#x003C4;</bold><sub>max</sub> &#x0003D; 60, <bold>&#x003C4;</bold><sub>min</sub> &#x0003D; &#x02212;60, and |<bold><italic>d</italic></bold>| &#x02264; <bold>2</bold>, with the disturbance being uniform noise occurring 75% of the time. For more specific settings of the robot parameters in each environment, refer to <xref ref-type="supplementary-material" rid="SM1">Appendix A</xref>.</p>
<sec>
<title>7.2 Evaluation of algorithm</title>
<sec>
<title>7.2.1 Generalization ability</title>
<p>In this experiment, the performance of CMPO is compared with traditional PID (Wang et al., <xref ref-type="bibr" rid="B35">2020</xref>) controller.</p>
<p>The trained CMPO and the fully parameter-tuned PID algorithms are first simulated in the basic environment. The results are presented in <xref ref-type="fig" rid="F4">Figure 4</xref>. It can be observed from <xref ref-type="fig" rid="F4">Figures 4A, B</xref> that the agent boosts the speed of the convergence afterward by sacrificing the performance of link 1 at the beginning. With this policy, it is obvious from <xref ref-type="fig" rid="F4">Figures 4C, D</xref> that the converged tracking error of CMPO is significantly better than that of PID.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Comparison of tracking performance between CMPO and conventional PID controllers in the basic environment. <bold>(A, B)</bold> represent the curves of the position tracking over time for joint 0 and joint 1; <bold>(C, D)</bold> represent the curves of the tracking error over time for joint 0 and joint 1; <bold>(E, F)</bold> represent the curves of the input torque magnitude over time for joint 0 and joint 1.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0004.tif"/>
</fig>
<p>The same models and parameters are then applied to simulate the small-change environment. This experiment compares their generalization abilities in a robotic environment with high tolerance. The results are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. The tracking trajectories and errors of <xref ref-type="fig" rid="F5">Figures 5A</xref>&#x02013;<xref ref-type="fig" rid="F5">D</xref> in this environment are similar to those of the basic environment, demonstrating the admirable generalization capabilities of traditional controllers. However, the input costs are shown to be different in <xref ref-type="fig" rid="F5">Figures 5E</xref>, <xref ref-type="fig" rid="F5">F</xref>, with CMPO having lower input than PID. This comparison empirically suggests that the agent input policy further enhances the control performance.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Comparison of tracking performance between CMPO and conventional PID controllers in the small-change environment with the same parameters in basic environment. The display content of each component image is similar to that of <xref ref-type="fig" rid="F4">Figure 4</xref>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0005.tif"/>
</fig>
<p>Finally, a CMPO with the same parameters is simulated in the big-change environment, which is then compared with the fine-tuned CMPO. The tracking results are shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. The original CMPO takes longer to converge in performance, which is shown in <xref ref-type="fig" rid="F6">Figure 6A</xref>. Based on the data presented in <xref ref-type="fig" rid="F6">Figure 6E</xref>, it is evident that the cause of the issue lies in the input saturation being more severe. Fine-tuning CMPO requires only approximately 20% of the original training cost, as shown in <xref ref-type="fig" rid="F7">Figure 7B</xref>, demonstrating the value of generalization.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Comparison of tracking performance between CMPO and fine-tuning CMPO in the big-change environment. The display content of each component image is similar to that of <xref ref-type="fig" rid="F4">Figure 4</xref>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0006.tif"/>
</fig>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Visualization of agents training metrics in the basic environment. <bold>(A)</bold> is the reward curve for CMPO compared with baseline RL, obtained by taking the mean and standard deviation of five times training in the basic environment; <bold>(B)</bold> compares the reward curves trained in the basic environment with that from fine-tuning the model in big change; <bold>(C)</bold> is a visualization of the basic environmental reward curve vs. the adaptive environment buffer ratio; <bold>(D)</bold> visualizes the agents using fixed and adaptive ratios to output intrinsic rewards in the basic environment.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0007.tif"/>
</fig></sec>
<sec>
<title>7.2.2 Training performance</title>
<p>In this experiment, the CMPO is performed with other RL algorithms, including SAC, MBPO, and AMPO in the basic environment. The detailed parameter settings are shown in <xref ref-type="supplementary-material" rid="SM1">Appendix B</xref>.</p>
<p>The reward curves are shown in <xref ref-type="fig" rid="F7">Figure 7A</xref>. It can be observed that CMPO trains faster than all baselines in the basic environment, indicating that the curiosity model plays an important role in enhancing sampling efficiency and robustness. In addition, the trajectory tracking performance of the algorithms is compared and can be accessed in <xref ref-type="fig" rid="F8">Figure 8</xref>. It is found that CMPO tracking outperforms all the baseline algorithms while achieving asymptotic performance slightly better than that of SAC.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Comparison of the tracking effectiveness of CMPO with other baseline RL in the basic environment. <bold>(A, B)</bold> represent the curves of the position tracking over time for joint 0 and joint 1; <bold>(C, D)</bold> respectively represent the curves of the tracking error over time for joint 0 and joint 1.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0008.tif"/>
</fig>
<p>To further validate the effect of the ratios, the simultaneous change in the ratios with the rewards is shown in <xref ref-type="fig" rid="F7">Figures 7C</xref>, <xref ref-type="fig" rid="F7">D</xref>. In each episode of agent training, <xref ref-type="fig" rid="F7">Figure 7C</xref> shows the variation of the environment buffer sampling ratio with rewards. The ratios reflect a general upward trend throughout training. But in detail, a trend of decreasing rewards is repeatedly predicted to elevate the ratios, which is consistent with the theoretical remarks (Lai et al., <xref ref-type="bibr" rid="B19">2021</xref>). The intrinsic reward for curiosity with a fixed ratio has much larger outputs than the adaptive ratio at the beginning, as shown in <xref ref-type="fig" rid="F7">Figure 7D</xref>. In addition, the adaptive ratio still maintains an effective curiosity reward output in the later stages.</p>
<p><xref ref-type="fig" rid="F9">Figure 9</xref> illustrates the control performance of the advanced controller with different robotic model errors and the CMPO in the basic environment. The advanced controller demonstrates optimal control performance in error-free conditions, as shown in <xref ref-type="fig" rid="F9">Figures 9A</xref>&#x02013;<xref ref-type="fig" rid="F9">D</xref>. However, as shown in <xref ref-type="fig" rid="F9">Figures 9E</xref>, <xref ref-type="fig" rid="F9">F</xref>, the advanced controller exhibits the highest input fluctuation and associated input cost at this juncture. Conversely, the CMPO approach achieves comparable control performance while minimizing input costs. Due to the advanced controller&#x00027;s high sensitivity to variations in the robotic model, its control performance diminishes with increasing model error, ultimately falling behind that of the CMPO algorithm.</p>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Comparison of the tracking effectiveness of CMPO in the same parameters with cutting-edge controller in the basic environment. The display content of each component image is similar to that of <xref ref-type="fig" rid="F4">Figure 4</xref>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0009.tif"/>
</fig>
<p>In uncertain environments, low-frequency or high-frequency disturbance inputs can reveal controller&#x00027;s immunity to interference. <xref ref-type="fig" rid="F10">Figure 10</xref> depicts the control performance of the CMPO algorithm under varying disturbance probabilities. <xref ref-type="fig" rid="F10">Figures 10A</xref>&#x02013;<xref ref-type="fig" rid="F10">D</xref> demonstrate that disturbances within bounded ranges exhibit negligible impact on control performance, irrespective of their frequency characteristics. Notably, <xref ref-type="fig" rid="F10">Figures 10E</xref>, <xref ref-type="fig" rid="F10">F</xref> reveal discernible differences in moment inputs generated under differing disturbance scenarios. Specifically, with increasing disturbance probability, greater input magnitudes are employed to mitigate the disturbance effects.</p>
<fig id="F10" position="float">
<label>Figure 10</label>
<caption><p>Comparison of the tracking effectiveness of CMPO in the same parameters with different input disturbance frequency in the basic environment. The display content of each component image is similar to that of <xref ref-type="fig" rid="F4">Figure 4</xref>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0010.tif"/>
</fig></sec>
<sec>
<title>7.2.3 Ablation experiment</title>
<p>To demonstrate the effect of model enhancement across modules, an ablation experiment is performed. The result is shown in <xref ref-type="fig" rid="F11">Figure 11</xref>. Reward convergence is less stable when curiosity expansion is no longer applied, although the reward ascends slightly faster than before. The rate of convergence of rewards is substantially reduced if the positive&#x02013;negative intrinsic reward is removed. A similar degradation occurs when the buffer scheduler is removed; moreover, the convergence results deteriorate. This experiment demonstrates the beneficial effects of all three modules in CMPO, improving the sampling efficiency of the model.</p>
<fig id="F11" position="float">
<label>Figure 11</label>
<caption><p>The results of the ablation experiment of CMPO. After removing the individual modules of CMPO separately, the training is performed in the basic environment with a step size of 100K. Comparison of ablated modules includes curiosity expansion, positive-negative curiosity, and adaptive scheduling buffer ratio.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0011.tif"/>
</fig></sec>
<sec>
<title>7.2.4 Hyperparametric sensitivity experiment</title>
<p>Parameter sensitivity experiments offer valuable insights into the impact of parameter variations on the efficacy of model training. <xref ref-type="fig" rid="F12">Figure 12</xref> shows the simultaneous testing of the agent learning rate &#x003BB;<sub><italic>a</italic></sub>, &#x003BB;<sub><italic>Q</italic></sub>, &#x003BB;<sub>&#x003B1;</sub> (denoted as <monospace>agent_lr</monospace>), the curiosity network learning rate &#x003BB;<sub>icm</sub> (denoted as <monospace>curiosity_lr</monospace>), and the world model learning rate &#x003BB;<sub>&#x003B8;</sub> (denoted as <monospace>model_lr</monospace>). As shown in <xref ref-type="fig" rid="F12">Figure 12A</xref>, it is evident that the agent&#x00027;s performance shows minimal sensitivity to changes in the learning rate parameter. Despite fluctuations in the reward curve corresponding to different parameters, the overall trend toward eventual convergence remains consistent. Conversely, <xref ref-type="fig" rid="F12">Figures 12B</xref>, <xref ref-type="fig" rid="F12">C</xref> illustrate that both the world model and curiosity network exhibit sensitivity to changes in the learning rate. Particularly in the world model shown in <xref ref-type="fig" rid="F12">Figure 12B</xref>, excessively small learning rates can lead to failure in achieving convergence during agent training.</p>
<fig id="F12" position="float">
<label>Figure 12</label>
<caption><p>Training reward curves for the CMPO parameter sensitivity experiment. Reward changes for agent training are obtained by increasing and decreasing the learning rate of the corresponding network after training once in the basic environment with a step size of 500K. <bold>(A)</bold> demonstrates the effect of different learning rates of the agent on training, <bold>(B)</bold> exhibits the impact of different learning rates of the world model on training, and <bold>(C)</bold> compares the influence of different learning rates of the curiosity network on training.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnbot-18-1376215-g0012.tif"/>
</fig></sec></sec></sec>
<sec sec-type="conclusions" id="s8">
<title>8 Conclusion</title>
<p>This study investigates agent-efficient sampling and training for robot manipulators with input saturation in uncertain environments. The combination of the curiosity model and a traditional model-free controller is developed to strengthen trajectory tracking performance. Specifically, a notion of positive&#x02013;negative intrinsic is defined and used in conjunction with adaptive ratios. The gain policies implemented by the agent based on CMPO are empirically concluded to effectively potentiate control performance. In addition, the framework can achieve low-cost fine-tuning to boost tracking capabilities in different scenarios, which facilitates the application. By virtue of these experimental results, augmented model sampling efficiency and competitive control performance are exhibited.</p>
<p>The aforementioned procedure is executed via numerical simulations. In forthcoming advancements, experimental endeavors will leverage robotic manipulators equipped with expanded input&#x02013;output capacities and augmented degrees of freedom. Correspondingly, the creation of increasingly intricate application environments is envisaged.</p></sec>
<sec sec-type="data-availability" id="s9">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p></sec>
<sec sec-type="author-contributions" id="s10">
<title>Author contributions</title>
<p>TW: Conceptualization, Data curation, Investigation, Methodology, Software, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. FW: Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. ZX: Methodology, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing. FQ: Methodology, Project administration, Resources, Supervision, Writing &#x02013; review &#x00026; editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s11">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This research was supported in part by National Natural Science Foundation of China under grant nos. 62203116 and 62205057, in part by GuangDong Basic and Applied Basic Research Foundation 2024A1515010222, in part by Characteristic Innovation Foundation of Guangdong Education Department under grant 2022ktscx138, KQNCX088, in part by Dongguan Science and Technology of Social Development Program under grant no. 20231800935882.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s13">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fnbot.2024.1376215/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fnbot.2024.1376215/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Boyd</surname> <given-names>S. P.</given-names></name> <name><surname>Vandenberghe</surname> <given-names>L.</given-names></name></person-group> (<year>2004</year>). <source>Convex Optimization</source>. <publisher-loc>Cambridge, UK; New York</publisher-loc>: <publisher-name>Cambridge University Press</publisher-name>. <pub-id pub-id-type="doi">10.1017/CBO9780511804441</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brunke</surname> <given-names>L.</given-names></name> <name><surname>Greeff</surname> <given-names>M.</given-names></name> <name><surname>Hall</surname> <given-names>A. W.</given-names></name> <name><surname>Yuan</surname> <given-names>Z.</given-names></name> <name><surname>Zhou</surname> <given-names>S.</given-names></name> <name><surname>Panerati</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Safe learning in robotics: from learning-based control to safe reinforcement learning</article-title>. <source>Ann. Rev. Control, Robot. Auton. Syst</source>. <volume>5</volume>, <fpage>411</fpage>&#x02013;<lpage>444</lpage>. <pub-id pub-id-type="doi">10.1146/annurev-control-042920-020211</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Burda</surname> <given-names>Y.</given-names></name> <name><surname>Edwards</surname> <given-names>H.</given-names></name> <name><surname>Pathak</surname> <given-names>D.</given-names></name> <name><surname>Storkey</surname> <given-names>A.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name> <name><surname>Efros</surname> <given-names>A. A.</given-names></name></person-group> (<year>2018a</year>). <article-title>Large-scale study of curiosity-driven learning</article-title>. <source>arXiv preprint arXiv:1808.04355</source>.</citation>
</ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Burda</surname> <given-names>Y.</given-names></name> <name><surname>Edwards</surname> <given-names>H.</given-names></name> <name><surname>Storkey</surname> <given-names>A.</given-names></name> <name><surname>Klimov</surname> <given-names>O.</given-names></name></person-group> (<year>2018b</year>). <article-title>Exploration by random network distillation</article-title>. <source>arXiv preprint arXiv:1810.12894</source>.</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>L.</given-names></name> <name><surname>Jiang</surname> <given-names>J.</given-names></name> <name><surname>Zuo</surname> <given-names>Z.</given-names></name></person-group> (<year>2021</year>). <article-title>Reinforcement learning-based fixed-time trajectory tracking control for uncertain robotic manipulators with input saturation</article-title>. <source>IEEE Trans. Neural Netw. Lear. Syst</source>. <volume>34</volume>, <fpage>4584</fpage>&#x02013;<lpage>4595</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3116713</pub-id><pub-id pub-id-type="pmid">34653006</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chertopolokhov</surname> <given-names>V.</given-names></name> <name><surname>Andrianova</surname> <given-names>O.</given-names></name> <name><surname>Hernandez-Sanchez</surname> <given-names>A.</given-names></name> <name><surname>Mireles</surname> <given-names>C.</given-names></name> <name><surname>Poznyak</surname> <given-names>A.</given-names></name> <name><surname>Chairez</surname> <given-names>I.</given-names></name></person-group> (<year>2023</year>). <article-title>Averaged sub-gradient integral sliding mode control design for cueing end-effector acceleration of a two-link robotic arm</article-title>. <source>ISA Trans</source>. <volume>133</volume>, <fpage>134</fpage>&#x02013;<lpage>146</lpage>. <pub-id pub-id-type="doi">10.1016/j.isatra.2022.07.024</pub-id><pub-id pub-id-type="pmid">35963654</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chua</surname> <given-names>K.</given-names></name> <name><surname>Calandra</surname> <given-names>R.</given-names></name> <name><surname>McAllister</surname> <given-names>R.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Deep reinforcement learning in a handful of trials using probabilistic dynamics models,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> (<publisher-loc>Curran Associates, Inc.</publisher-loc>).</citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Elguea-Aguinaco</surname> <given-names>I.</given-names></name> <name><surname>Serrano-Mu&#x000F1;oz</surname> <given-names>A.</given-names></name> <name><surname>Chrysostomou</surname> <given-names>D.</given-names></name> <name><surname>Inziarte-Hidalgo</surname> <given-names>I.</given-names></name> <name><surname>B&#x000F8;gh</surname> <given-names>S.</given-names></name> <name><surname>Arana-Arexolaleiba</surname> <given-names>N.</given-names></name></person-group> (<year>2023</year>). <article-title>A review on reinforcement learning for contact-rich robotic manipulation tasks</article-title>. <source>Robot. Comput. Integr. Manuf</source>. <volume>81</volume>:<fpage>102517</fpage>. <pub-id pub-id-type="doi">10.1016/j.rcim.2022.102517</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>K.</given-names></name> <name><surname>Zhai</surname> <given-names>Y.</given-names></name> <name><surname>Ding</surname> <given-names>B.</given-names></name> <name><surname>Feng</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Dynamic memory-based curiosity: a bootstrap approach for exploration in reinforcement learning</article-title>. <source>IEEE Transactions on Emerging Topics in Computational Intelligence</source>, <fpage>1</fpage>&#x02013;<lpage>13</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Grill</surname> <given-names>J.-B.</given-names></name> <name><surname>Strub</surname> <given-names>F.</given-names></name> <name><surname>Altch&#x000E9;</surname> <given-names>F.</given-names></name> <name><surname>Tallec</surname> <given-names>C.</given-names></name> <name><surname>Richemond</surname> <given-names>P. H.</given-names></name> <name><surname>Buchatskaya</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Bootstrap your own latent: a new approach to self-supervised learning</article-title>. <source>Adv. Neural Inf. Proc. Syst</source>. <volume>33</volume>, <fpage>21271</fpage>&#x02013;<lpage>21284</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2006.07733</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Zuo</surname> <given-names>Z.</given-names></name> <name><surname>Shi</surname> <given-names>Y.</given-names></name> <name><surname>Jiang</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>Quasi-synchronization control of multiple electrohydraulic actuators with load disturbance and uncertain parameters</article-title>. <source>IEEE/ASME Trans. Mechatr</source>. <volume>26</volume>, <fpage>2048</fpage>&#x02013;<lpage>2058</lpage>. <pub-id pub-id-type="doi">10.1109/TMECH.2020.3030032</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Haarnoja</surname> <given-names>T.</given-names></name> <name><surname>Zhou</surname> <given-names>A.</given-names></name> <name><surname>Hartikainen</surname> <given-names>K.</given-names></name> <name><surname>Tucker</surname> <given-names>G.</given-names></name> <name><surname>Ha</surname> <given-names>S.</given-names></name> <name><surname>Tan</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Soft actor-critic algorithms and applications</article-title>. <source>arXiv preprint arXiv:1812.05905</source>.</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>Reinforcement learning tracking control for robotic manipulator with kernel-based dynamic model</article-title>. <source>IEEE Trans. Neural Netw. Lear. Syst</source>. <volume>31</volume>, <fpage>3570</fpage>&#x02013;<lpage>3578</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2019.2945019</pub-id><pub-id pub-id-type="pmid">31689218</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>F.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Cui</surname> <given-names>J.</given-names></name> <name><surname>Fu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>Unified curiosity-driven learning with smoothed intrinsic reward estimation</article-title>. <source>Patt. Recogn</source>. <volume>123</volume>:<fpage>108352</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2021.108352</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Islam</surname> <given-names>S.</given-names></name> <name><surname>Liu</surname> <given-names>X. P.</given-names></name></person-group> (<year>2011</year>). <article-title>Robust sliding mode control for robot manipulators</article-title>. <source>IEEE Trans. Ind. Electr</source>. <volume>58</volume>, <fpage>2444</fpage>&#x02013;<lpage>2453</lpage>. <pub-id pub-id-type="doi">10.1109/TIE.2010.2062472</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Janner</surname> <given-names>M.</given-names></name> <name><surname>Fu</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;When to trust your model: model-based policy optimization,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <volume>32</volume>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kapturowski</surname> <given-names>S.</given-names></name> <name><surname>Campos</surname> <given-names>V.</given-names></name> <name><surname>Jiang</surname> <given-names>R.</given-names></name> <name><surname>Raki&#x00107;evi&#x00107;</surname> <given-names>N.</given-names></name> <name><surname>van Hasselt</surname> <given-names>H.</given-names></name> <name><surname>Blundell</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Human-level Atari 200x faster</article-title>. <source>arXiv preprint arXiv:2209.07550</source>.</citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kurutach</surname> <given-names>T.</given-names></name> <name><surname>Clavera</surname> <given-names>I.</given-names></name> <name><surname>Duan</surname> <given-names>Y.</given-names></name> <name><surname>Tamar</surname> <given-names>A.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name></person-group> (<year>2018</year>). <article-title>Model-ensemble trust-region policy optimization</article-title>. <source>arXiv preprint arXiv:1802.10592</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lai</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Tang</surname> <given-names>R.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;On effective scheduling of model-based reinforcement learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source> (<publisher-loc>Curran Associates, Inc.</publisher-loc>), <fpage>3694</fpage>&#x02013;<lpage>3705</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lai</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Bidirectional model-based policy optimization,&#x0201D;</article-title> in <source>Proceedings of the 37th International Conference on Machine Learning</source> (<publisher-loc>PMLR</publisher-loc>), <fpage>5618</fpage>&#x02013;<lpage>5627</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Seo</surname> <given-names>Y.</given-names></name> <name><surname>Lee</surname> <given-names>S.</given-names></name> <name><surname>Lee</surname> <given-names>H.</given-names></name> <name><surname>Shin</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Context-aware dynamics model for generalization in model-based reinforcement learning,&#x0201D;</article-title> in <source>Proceedings of the 37th International Conference on Machine Learning</source> (<publisher-loc>PMLR</publisher-loc>), <fpage>5757</fpage>&#x02013;<lpage>5766</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Shi</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>Random curiosity-driven exploration in deep reinforcement learning</article-title>. <source>Neurocomputing</source> <volume>418</volume>, <fpage>139</fpage>&#x02013;<lpage>147</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.08.024</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname> <given-names>P.</given-names></name> <name><surname>Huang</surname> <given-names>W.</given-names></name> <name><surname>Xiao</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>F.</given-names></name> <name><surname>Hu</surname> <given-names>W.</given-names></name></person-group> (<year>2021</year>). <article-title>Adaptive proportional integral robust control of an uncertain robotic manipulator based on deep deterministic policy gradient</article-title>. <source>Mathematics</source> <volume>9</volume>:<fpage>2055</fpage>. <pub-id pub-id-type="doi">10.3390/math9172055</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>F.-M.</given-names></name> <name><surname>Xu</surname> <given-names>T.</given-names></name> <name><surname>Lai</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>X.-H.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>A survey on model-based reinforcement learning</article-title>. <source>Sci. China Inf. Sci</source>. <volume>67</volume>:<fpage>121101</fpage>. <pub-id pub-id-type="doi">10.1007/s11432-022-3696-5</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Luo</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name> <name><surname>Ma</surname> <given-names>T.</given-names></name></person-group> (<year>2021</year>). <article-title>Algorithmic framework for model-based deep reinforcement learning with theoretical guarantees</article-title>. <source>arXiv preprint arXiv:1807.03858</source>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mnih</surname> <given-names>V.</given-names></name> <name><surname>Kavukcuoglu</surname> <given-names>K.</given-names></name> <name><surname>Silver</surname> <given-names>D.</given-names></name> <name><surname>Graves</surname> <given-names>A.</given-names></name> <name><surname>Antonoglou</surname> <given-names>I.</given-names></name> <name><surname>Wierstra</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>Playing atari with deep reinforcement learning</article-title>. <source>arXiv preprint arXiv:1312.5602</source>.</citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pane</surname> <given-names>Y. P.</given-names></name> <name><surname>Nageshrao</surname> <given-names>S. P.</given-names></name> <name><surname>Kober</surname> <given-names>J.</given-names></name> <name><surname>Babu&#x00161;ka</surname> <given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>Reinforcement learning based compensation methods for robot manipulators</article-title>. <source>Eng. Applic. Artif. Intell</source>. <volume>78</volume>, <fpage>236</fpage>&#x02013;<lpage>247</lpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2018.11.006</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pathak</surname> <given-names>D.</given-names></name> <name><surname>Agrawal</surname> <given-names>P.</given-names></name> <name><surname>Efros</surname> <given-names>A. A.</given-names></name> <name><surname>Darrell</surname> <given-names>T.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Curiosity-driven exploration by self-supervised prediction,&#x0201D;</article-title> in <source>Proceedings of the 34th International Conference on Machine Learning</source> (<publisher-loc>PMLR</publisher-loc>), <fpage>2778</fpage>&#x02013;<lpage>2787</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW.2017.70</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Peng</surname> <given-names>B.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Wong</surname> <given-names>K.-F.</given-names></name> <name><surname>Su</surname> <given-names>S.-Y.</given-names></name></person-group> (<year>2018</year>). <article-title>Deep dyna-q: integrating planning for task-completion dialogue policy learning</article-title>. <source>arXiv preprint arXiv:1801.06176</source>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>Model-based policy optimization with unsupervised model adaptation</article-title>. <source>Adv. Neural Inf. Proc. Syst</source>. <volume>33</volume>, <fpage>2823</fpage>&#x02013;<lpage>2834</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2010.09546</pub-id><pub-id pub-id-type="pmid">24520315</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stadie</surname> <given-names>B. C.</given-names></name> <name><surname>Levine</surname> <given-names>S.</given-names></name> <name><surname>Abbeel</surname> <given-names>P.</given-names></name></person-group> (<year>2015</year>). <article-title>Incentivizing exploration in reinforcement learning with deep predictive models</article-title>. <source>arXiv preprint arXiv:1507.00814</source>.</citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Qian</surname> <given-names>H.</given-names></name> <name><surname>Miao</surname> <given-names>C.</given-names></name></person-group> (<year>2022a</year>). <article-title>CCLF: a contrastive-curiosity-driven learning framework for sample-efficient reinforcement learning</article-title>. <source>arXiv preprint arXiv:2205.00943</source>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Qian</surname> <given-names>H.</given-names></name> <name><surname>Miao</surname> <given-names>C.</given-names></name></person-group> (<year>2022b</year>). <article-title>From psychological curiosity to artificial curiosity: curiosity-driven learning in artificial intelligence tasks</article-title>. <source>arXiv preprint arXiv:2201.08300</source>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thuruthel</surname> <given-names>T. G.</given-names></name> <name><surname>Falotico</surname> <given-names>E.</given-names></name> <name><surname>Renda</surname> <given-names>F.</given-names></name> <name><surname>Laschi</surname> <given-names>C.</given-names></name></person-group> (<year>2019</year>). <article-title>Model-based reinforcement learning for closed-loop dynamic control of soft robotic manipulators</article-title>. <source>IEEE Trans. Robot</source>. <volume>35</volume>, <fpage>124</fpage>&#x02013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2018.2878318</pub-id><pub-id pub-id-type="pmid">38109254</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>S.</given-names></name> <name><surname>Yin</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>P.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Trajectory tracking control for mobile robots using reinforcement learning and PID</article-title>. <source>Iranian J. Sci. Technol. Trans. Electr. Eng</source>. <volume>44</volume>, <fpage>1059</fpage>&#x02013;<lpage>1068</lpage>. <pub-id pub-id-type="doi">10.1007/s40998-019-00286-4</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Guo</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Contact force estimation of robot manipulators with imperfect dynamic model: on gaussian process adaptive disturbance kalman filter,&#x0201D;</article-title> in <source>IEEE Transactions on Automation Science and Engineering</source>, <fpage>1</fpage>&#x02013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2023.3280750</pub-id></citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>K.</given-names></name> <name><surname>Wu</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Generalizing reinforcement learning through fusing self-supervised learning into intrinsic motivation,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence</source>, <fpage>8683</fpage>&#x02013;<lpage>8690</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v36i8.20847</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>J.</given-names></name> <name><surname>Hou</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>K.</given-names></name> <name><surname>Chen</surname> <given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>Feedback deep deterministic policy gradient with fuzzy reward for robotic multiple peg-in-hole assembly tasks</article-title>. <source>IEEE Trans. Industr. Inf</source>. <volume>15</volume>, <fpage>1658</fpage>&#x02013;<lpage>1667</lpage>. <pub-id pub-id-type="doi">10.1109/TII.2018.2868859</pub-id></citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>H.-K.</given-names></name> <name><surname>Chiang</surname> <given-names>P.-H.</given-names></name> <name><surname>Ho</surname> <given-names>K.-W.</given-names></name> <name><surname>Hong</surname> <given-names>M.-F.</given-names></name> <name><surname>Lee</surname> <given-names>C.-Y.</given-names></name></person-group> (<year>2019</year>). <article-title>Never forget: balancing exploration and exploitation via learning optical flow</article-title>. <source>arXiv preprint arXiv:1901.08486</source>.</citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>F.</given-names></name> <name><surname>Cheng</surname> <given-names>M.</given-names></name> <name><surname>Ding</surname> <given-names>R.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <name><surname>Zong</surname> <given-names>H.</given-names></name></person-group> (<year>2024</year>). <article-title>Parameter identification of hydraulic manipulators considering physical feasibility and control stability</article-title>. <source>IEEE Trans. Industr. Electr</source>. <volume>71</volume>, <fpage>718</fpage>&#x02013;<lpage>728</lpage>. <pub-id pub-id-type="doi">10.1109/TIE.2023.3250753</pub-id></citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhelo</surname> <given-names>O.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Tai</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Burgard</surname> <given-names>W.</given-names></name></person-group> (<year>2018</year>). <article-title>Curiosity-driven exploration for mapless navigation with deep reinforcement learning</article-title>. <source>arXiv preprint arXiv:1804.00456</source>.</citation>
</ref>
</ref-list>
</back>
</article>