<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="review-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Robot. AI</journal-id>
<journal-title>Frontiers in Robotics and AI</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Robot. AI</abbrev-journal-title>
<issn pub-type="epub">2296-9144</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">799893</article-id>
<article-id pub-id-type="doi">10.3389/frobt.2022.799893</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Robotics and AI</subject>
<subj-group>
<subject>Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Robot Learning From Randomized Simulations: A Review</article-title>
<alt-title alt-title-type="left-running-head">Muratore et al.</alt-title>
<alt-title alt-title-type="right-running-head">Robot Learning From Randomized Simulations</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Muratore</surname>
<given-names>Fabio</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1525990/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ramos</surname>
<given-names>Fabio</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Turk</surname>
<given-names>Greg</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yu</surname>
<given-names>Wenhao</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gienger</surname>
<given-names>Michael</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/166562/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Peters</surname>
<given-names>Jan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Intelligent Autonomous Systems Group</institution>, <institution>Technical University of Darmstadt</institution>, <addr-line>Darmstadt</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Honda Research Institute Europe</institution>, <addr-line>Offenbach am Main</addr-line>, <country>Germany</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Computer Science</institution>, <institution>University of Sydney</institution>, <addr-line>Sydney</addr-line>, <addr-line>NSW</addr-line>, <country>Australia</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>NVIDIA</institution>, <addr-line>Seattle</addr-line>, <addr-line>WA</addr-line>, <country>United States</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Georgia Institute of Technology</institution>, <addr-line>Atlanta</addr-line>, <addr-line>GA</addr-line>, <country>United States</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Robotics at Google</institution>, <addr-line>Mountain View</addr-line>, <addr-line>CA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/134032/overview">Antonio Fern&#xe1;ndez-Caballero</ext-link>, University of Castilla-La Mancha, Spain</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/386027/overview">Akansel Cosgun</ext-link>, Monash University, Australia</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/482341/overview">Konstantinos Chatzilygeroudis</ext-link>, University of Patras, Greece</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Fabio Muratore, <email>fabio@robot-learning.de</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Robot Learning and Evolution, a section of the journal Frontiers in Robotics and AI</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>11</day>
<month>04</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>9</volume>
<elocation-id>799893</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>10</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>01</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Muratore, Ramos, Turk, Yu, Gienger and Peters.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Muratore, Ramos, Turk, Yu, Gienger and Peters</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The rise of deep learning has caused a paradigm shift in robotics research, favoring methods that require large amounts of data. Unfortunately, it is prohibitively expensive to generate such data sets on a physical platform. Therefore, state-of-the-art approaches learn in simulation where data generation is fast as well as inexpensive and subsequently transfer the knowledge to the real robot (sim-to-real). Despite becoming increasingly realistic, all simulators are by construction based on models, hence inevitably imperfect. This raises the question of how simulators can be modified to facilitate learning robot control policies and overcome the mismatch between simulation and reality, often called the &#x201c;reality gap.&#x201d; We provide a comprehensive review of sim-to-real research for robotics, focusing on a technique named &#x201c;domain randomization&#x201d; which is a method for learning from randomized simulations.</p>
</abstract>
<kwd-group>
<kwd>robotics</kwd>
<kwd>simulation</kwd>
<kwd>reality gap</kwd>
<kwd>simulation optimization bias</kwd>
<kwd>reinforcement learning</kwd>
<kwd>domain randomization</kwd>
<kwd>sim-to-real</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Given that machine learning has achieved super-human performance in image classification (<xref ref-type="bibr" rid="B36">Ciresan et al., 2012</xref>; <xref ref-type="bibr" rid="B93">Krizhevsky et al., 2012</xref>) and games (<xref ref-type="bibr" rid="B119">Mnih et al., 2015</xref>; <xref ref-type="bibr" rid="B158">Silver et al., 2016</xref>), the question arises why we do not see similar results in robotics. There are several reasons for this. First, learning to act in the physical world is orders of magnitude more difficult. While the data required by modern (deep) learning algorithms could be acquired directly on a real robot (<xref ref-type="bibr" rid="B100">Levine et al., 2018</xref>), this solution is too expensive in terms of time and resources to scale up. Alternatively, the data can be generated in simulation faster, cheaper, safer, and with unmatched diversity. In doing so, we have to cope with unavoidable approximation errors that we make when modeling reality. These errors, often referred to as the &#x201c;reality gap,&#x201d; originate from omitting physical phenomena, inaccurate parameter estimation, or the discretized numerical integration in typical solvers. Compounding this issue, state-of-the-art (deep) learning methods are known to be brittle (<xref ref-type="bibr" rid="B163">Szegedy et al., 2014</xref>; <xref ref-type="bibr" rid="B62">Goodfellow et al., 2015</xref>; <xref ref-type="bibr" rid="B73">Huang et al., 2017</xref>), that is, sensitive to shifts in their input domains. Additionally, the learner is free to exploit the simulator, overfitting to features which do not occur in the real world. For example, <xref ref-type="bibr" rid="B11">Baker et al. (2020)</xref> noticed that the agents learned to exploit the physics engine to gain an unexpected advantage. While this exploitation is an interesting observation for studies made entirely in simulation, it is highly undesirable in sim-to-real scenarios. 
In the best case, the reality gap manifests itself as a performance drop, giving a lower success rate or reduced tracking accuracy. More likely, the learned policy is not transferable to the robot because of unknown physical effects. One effect that is difficult to model is friction, often leading to an underestimation thereof in simulation, which can result in motor commands that are not strong enough to get the robot moving. Another reason for failure is parameter estimation errors, which can quickly lead to unstable system dynamics. This case is particularly dangerous for the human and the robot. For these reasons, bridging the reality gap is the essential step to endow robots with the ability to learn from simulated experience.</p>
<p>There is a consensus that further increasing the simulator&#x2019;s accuracy alone will not bridge this gap (<xref ref-type="bibr" rid="B66">H&#xf6;fer et al., 2020</xref>). Looking at breakthroughs in machine learning, we see that deep models in combination with large and diverse data sets lead to better generalization (<xref ref-type="bibr" rid="B150">Russakovsky et al., 2015</xref>; <xref ref-type="bibr" rid="B143">Radford et al., 2019</xref>). In a similar spirit, a technique called domain randomization has recently gained momentum (<xref ref-type="fig" rid="F1">Figure 1</xref>). The common characteristic of such approaches is the perturbation of simulator parameters, state observations, or applied actions. Typical quantities to randomize include the bodies&#x2019; inertia and geometry, the parameters of the friction and contact models, possible delays in the actuation, efficiency coefficients of motors, levels of sensor noise, as well as visual properties such as colors, illumination, position and orientation of a camera, or additional artifacts to the image (e.g., glare). Domain randomization can be seen as a regularization method that prevents the learner from overfitting to individual simulation instances. From the Bayesian perspective, we can interpret the distribution over simulators as a representation of uncertainty.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Examples of sim-to-real robot learning research using domain randomization: (left) Multiple simulation instances of robotic in-hand manipulation (<xref ref-type="bibr" rid="B6">OpenAI et al., 2020</xref>), (middle top) transformation to a canonical simulation (<xref ref-type="bibr" rid="B79">James et al., 2019</xref>), (middle bottom) synthetic 3D hallways generated for indoor drone flight (<xref ref-type="bibr" rid="B154">Sadeghi and Levine, 2017</xref>), (right top) ball-in-a-cup task solved with adaptive dynamics randomization (<xref ref-type="bibr" rid="B124">Muratore et al., 2021a</xref>), (right bottom) quadruped locomotion (<xref ref-type="bibr" rid="B164">Tan et al., 2018</xref>).</p>
</caption>
<graphic xlink:href="frobt-09-799893-g001.tif"/>
</fig>
<p>In this paper, we first introduce the necessary nomenclature and mathematical fundamentals for the problem (<xref ref-type="sec" rid="s2">Section 2</xref>). Next, we review early approaches for learning from randomized simulations, state the practical requirements, and describe measures for sim-to-real transferability (<xref ref-type="sec" rid="s3">Section 3</xref>). Subsequently, we discuss the connections between research on sim-to-real transfer and related fields (<xref ref-type="sec" rid="s4">Section 4</xref>). Moreover, we introduce a taxonomy for domain randomization and categorize the current state of the art (<xref ref-type="sec" rid="s5">Section 5</xref>). Finally, we conclude and outline possible future research directions (<xref ref-type="sec" rid="s6">Section 6</xref>). For those who want to first become more familiar with robot policy learning as well as policy search, we recommend these surveys: <xref ref-type="bibr" rid="B89">Kober et al. (2013)</xref>, <xref ref-type="bibr" rid="B49">Deisenroth et al. (2013)</xref>, and <xref ref-type="bibr" rid="B30">Chatzilygeroudis et al. (2020)</xref>.</p>
</sec>
<sec id="s2">
<title>2 Problem Formulation and Nomenclature</title>
<p>We begin our discussion by defining critical concepts and nomenclature used throughout this article.</p>
<p>Markov Decision Processes (MDPs): Consider a discrete-time dynamical system<disp-formula id="e1">
<mml:math id="m1">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>with the continuous state <inline-formula id="inf1">
<mml:math id="m2">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> and continuous action <inline-formula id="inf2">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> at time step <italic>t</italic>. The environment, also called domain, is characterized by its parameters <inline-formula id="inf3">
<mml:math id="m4">
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> (e.g., masses, friction coefficients, time delays, or surface appearance properties) which are in general assumed to be random variables distributed according to an unknown probability distribution <inline-formula id="inf4">
<mml:math id="m5">
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>:</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. A special case of this is the common assumption that the domain parameters obey a parametric distribution <inline-formula id="inf5">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3d5;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> with unknown parameters <bold>
<italic>&#x3d5;</italic>
</bold> (e.g., mean and variance). The domain parameters determine the transition probability density function <inline-formula id="inf6">
<mml:math id="m7">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> that describes the system&#x2019;s stochastic dynamics. The initial state <bold>
<italic>s</italic>
</bold>
<sub>0</sub> is drawn from the start state distribution <inline-formula id="inf7">
<mml:math id="m8">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. In general, the instantaneous reward is a random variable depending on the current state and action as well as the next state. Here we make the common simplification that the reward is a deterministic function of the current state and action <inline-formula id="inf8">
<mml:math id="m9">
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>:</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:math>
</inline-formula>. Together with the temporal discount factor <italic>&#x3b3;</italic> &#x2208; [0, 1], the system forms an MDP described by the tuple <inline-formula id="inf9">
<mml:math id="m10">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="&#x27e8;" close="&#x27e9;">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>.</p>
<p>Reinforcement Learning (RL): The goal of a RL agent is to maximize the expected (discounted) return, a numeric scoring function which measures the policy&#x2019;s performance. The expected discounted return of a policy <inline-formula id="inf10">
<mml:math id="m11">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> with the parameters <inline-formula id="inf11">
<mml:math id="m12">
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">&#x398;</mml:mi>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> is defined as<disp-formula id="e2">
<mml:math id="m13">
<mml:mi>J</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mspace width="0.3333em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c0;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="|" close="">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>While learning from experience, the agent adapts its policy parameters. The resulting state-action-reward tuples are collected in trajectories, a.k.a. rollouts, <inline-formula id="inf12">
<mml:math id="m14">
<mml:mi mathvariant="bold-italic">&#x3c4;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">T</mml:mi>
</mml:math>
</inline-formula> with <inline-formula id="inf13">
<mml:math id="m15">
<mml:msub>
<mml:mrow>
<mml:mi>r</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula>. In a partially observable MDP, the policy&#x2019;s input would not be the state but observations thereof <inline-formula id="inf14">
<mml:math id="m16">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">o</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, which are obtained through an environment-specific mapping <bold>
<italic>o</italic>
</bold>
<sub>
<italic>t</italic>
</sub> &#x3d; <italic>f</italic>
<sub>obs</sub> (<bold>
<italic>s</italic>
</bold>
<sub>
<italic>t</italic>
</sub>).</p>
<p>Domain randomization: When augmenting the RL setting with domain randomization, the goal becomes to maximize the expected (discounted) return for a distribution of domain parameters<disp-formula id="e3">
<mml:math id="m17">
<mml:mi>J</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:mi>J</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="double-struck">E</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3c4;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3c4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mspace width="-0.17em"/>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="|" close="">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3be;</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>The outer expectation with respect to the domain parameter distribution <italic>p</italic>(<bold>
<italic>&#x3be;</italic>
</bold>) is the key difference compared to the standard MDP formulation. It enables the learning of robust policies, in the sense that these policies work for a whole set of environments instead of overfitting to a particular problem instance.</p>
</sec>
<sec id="s3">
<title>3 Foundations of Sim-To-Real Transfer</title>
<p>Modern research on learning from (randomized) physics simulations is based on a solid foundation of prior work (<xref ref-type="sec" rid="s3-1">Section 3.1</xref>). Parametric simulators are the core component of every sim-to-real method (<xref ref-type="sec" rid="s3-2">Section 3.2</xref>). Even though the details of their randomization are crucial, they are rarely discussed (<xref ref-type="sec" rid="s3-3">Section 3.3</xref>). Estimating the sim-to-real transferability during or after learning allows one to assess or predict the policy&#x2019;s performance in the target domain (<xref ref-type="sec" rid="s3-4">Section 3.4</xref>).</p>
<sec id="s3-1">
<title>3.1 Early Methods</title>
<p>The roots of randomized simulations trace back to the invention of the Monte Carlo method (<xref ref-type="bibr" rid="B118">Metropolis and Ulam, 1949</xref>), which computes its results based on repeated random sampling and subsequent statistical analysis. Later, the concept of common random numbers, also called correlated sampling, was developed as a variance reduction technique (<xref ref-type="bibr" rid="B83">Kahn and Marshall, 1953</xref>; <xref ref-type="bibr" rid="B176">Wright and Ramsay, 1979</xref>). The idea is to synchronize the random numbers for all stochastic events across the simulation runs to achieve a (desirably positive) correlation between random variables reducing the variance of an estimator based on a combination of them. Many of the sim-to-real challenges which are currently tackled have already been identified by <xref ref-type="bibr" rid="B24">Brooks (1992)</xref>. In particular, Brooks addresses the overfitting to effects which only occur in simulation as well as the idealized modeling on sensing and actuation. To avoid overfitting, he advocated for reactive behavior-based programming which is deeply rooted in, hence tailored to, the embodiment. Focusing on RL, <xref ref-type="bibr" rid="B161">Sutton (1991)</xref> introduced the Dyna architecture which revolves around predicting from a learned world model and updating the policy from this hypothetical experience. Viewing the data generated from randomized simulators as &#x201c;imaginary,&#x201d; emphasizes the parallels of domain randomization to Dyna. As stated by Sutton, the usage of &#x201c;mental rehearsal&#x201d; to predict and reason about the effect of actions dates back even further in other fields of research such as psychology (<xref ref-type="bibr" rid="B39">Craik, 1943</xref>; <xref ref-type="bibr" rid="B52">Dennett, 1975</xref>). Instead of querying a learned internal model, <xref ref-type="bibr" rid="B77">Jakobi et al. 
(1995)</xref> added random noise to the sensors and actuators while learning, achieving the arguably first sim-to-real transfer in robotics. In follow-up work, <xref ref-type="bibr" rid="B76">Jakobi (1997)</xref> formulated the radical envelope of noise hypothesis which states that &#x201c;it does not matter how inaccurate or incomplete [the simulations] are: controllers that have evolved to be reliably fit in simulation will still transfer into reality.&#x201d; Picking up on the idea of common random numbers, <xref ref-type="bibr" rid="B129">Ng and Jordan (2000)</xref> suggested to explicitly control the randomness of a simulator, i.e., the random number generator&#x2019;s state, rendering the simulator deterministic. This way the same initial configurations can be (re-)used for Monte Carlo estimations of different policies&#x2019; value functions, allowing one to conduct policy search in partially observable problems. <xref ref-type="bibr" rid="B20">Bongard et al. (2006)</xref> bridged the sim-to-real gap through iterating model generation and selection depending on the short-term state-action history. This process is repeated for a given number of iterations, and then yields the self-model, i.e., a simulator, which best explains the observed data.</p>
<p>Inspired by these early approaches, the systematic analysis of randomized simulations for robot learning has become a highly active research direction. Moreover, the prior work above also falsifies the common belief that domain randomization originated recently with the rise of deep learning. Nevertheless, the current popularity of domain randomization can be explained by its widespread use in the computer vision and locomotion communities as well as its synergies with deep learning methods. The key difference between the early and the recent domain randomization methods (<xref ref-type="sec" rid="s5">Section 5</xref>) is that the latter (directly) manipulate the simulators&#x2019; parameters.</p>
</sec>
<sec id="s3-2">
<title>3.2 Constructing Stochastic Simulators</title>
<p>Simulators can be obtained by implementing a set of physical laws for a particular system. Given the challenges in implementing an efficient simulator for complex systems, it is common to use general purpose physics engines such as ODE, DART, Bullet, Newton, SimBody, Vortex, MuJoCo, Havok, Chrono, RaiSim, PhysX, FleX, or Brax. These simulators are parameterized generative models, which describe how multiple bodies or particles evolve over time by interacting with each other. The associated physics parameters can be estimated by system identification (<xref ref-type="sec" rid="s4-6">Section 4.6</xref>), which generally involves executing experiments on the physical platform and recording associated measurements. Additionally, using the Gauss-Markov theorem one could also compute the parameters&#x2019; covariance and hence construct a normal distribution for each domain parameter. Differentiable simulators facilitate deep learning for robotics (<xref ref-type="bibr" rid="B48">Degrave et al., 2019</xref>; <xref ref-type="bibr" rid="B38">Coumans, 2020</xref>; <xref ref-type="bibr" rid="B69">Heiden et al., 2021</xref>) by propagating the gradients through the dynamics. Current research extends the differentiability to soft body dynamics (<xref ref-type="bibr" rid="B72">Hu et al., 2019</xref>). Alternatively, the system dynamics can be captured using nonparametric methods like Gaussian Processes (GPs) (<xref ref-type="bibr" rid="B147">Rasmussen and Williams, 2006</xref>) as for example demonstrated by <xref ref-type="bibr" rid="B26">Calandra et al. (2015)</xref>. It is important to keep in mind that even if the domain parameters have been identified very accurately, simulators are nevertheless just approximations of the real world and are thus always imperfect.</p>
<p>Several comparisons between various physics engines were made (<xref ref-type="bibr" rid="B75">Ivaldi et al., 2014</xref>; <xref ref-type="bibr" rid="B56">Erez et al., 2015</xref>; <xref ref-type="bibr" rid="B35">Chung and Pollard, 2016</xref>; <xref ref-type="bibr" rid="B37">Collins et al., 2019</xref>; <xref ref-type="bibr" rid="B81">K&#xf6;rber et al., 2021</xref>). However, note that these results become outdated quickly due to the rapid development in the field, or are often limited to very few scenarios and partially introduce custom metrics to measure their performance or accuracy.</p>
<p>Apart from the physics engines listed above, there is an orthogonal research direction investigating human-inspired learning of the physics laws from visual input (<xref ref-type="bibr" rid="B14">Battaglia et al., 2013</xref>; <xref ref-type="bibr" rid="B177">Wu et al., 2015</xref>) as well as physical reasoning given a configuration of bodies (<xref ref-type="bibr" rid="B15">Battaglia et al., 2016</xref>), which is out of the scope of this review.</p>
</sec>
<sec id="s3-3">
<title>3.3 Randomizing a Simulator</title>
<p>Learning from randomized simulations entails significant design decisions:</p>
<p>Which parameters should be randomized? Depending on the problem, some domain parameters have no influence (e.g., the mass of an idealized rolling ball) while others are pivotal (e.g., the pendulum length for a stabilization task). It is recommended to first identify the essential parameters (<xref ref-type="bibr" rid="B179">Xie et al., 2020</xref>). For example, most robot locomotion papers highlight the importance of varying the terrain and contact models, while applications such as drone control benefit from adding perturbations, e.g., to simulate a gust of wind. Injecting random latency and noise to the actuation is another frequent modeling choice. Starting from a small set of randomized domain parameters, identified from prior knowledge, has the additional benefit of shortening the evaluation time which involves approximating an expectation over domains, which scales exponentially with the number of parameters. Moreover, including at least one visually observable parameter (e.g., an extent of a body) helps to verify if the values are set as expected.</p>
<p>When should the parameters be randomized? Episodic dynamics randomization, without a rigorous theoretical justification, is the most common approach. Randomizing the domain parameters at every time step instead would drastically increase the variance, and pose a challenge to the implementations since this typically implies recreating the simulation at every step. Imagine a stack of cubes standing on the ground. If we now vary the cubes&#x2019; side lengths individually while keeping their absolute positions fixed, they will either lose contact or intersect with their neighboring cube(s). In order to keep the stack intact, we need to randomize the cubes with respect to their neighbors, additionally moving them in space. Executing this once at the beginning is fine, but doing this at every step creates artificial &#x201c;movement&#x201d; which would almost certainly be detrimental. Orthogonal to the argumentation above, alternative approaches apply random disturbance forces and torques at every time step. In these cases, the distribution over disturbance magnitudes is chosen to be constant until the randomization scheme is updated. To the best of our knowledge, event-triggered randomization has not been explored yet.</p>
<p>How should the parameters be randomized? Answering this question is what characterizes a domain randomization method (<xref ref-type="sec" rid="s5">Section 5</xref>). There are a few aspects that need to be considered in practice when designing a domain randomization scheme, such as the numerical stability of the simulation instances. Low masses for example quickly lead to stiff differential equations which might require a different (implicit) integrator. Furthermore, the noise level of the introduced randomness needs to match the precision of the state estimation. If the noise is too low, the randomization is pointless. On the other side, if the noise level is too high, the learning procedure will fail. To find the right balance between these considerations, we can start by statistically analyzing the incoming measurement signals.</p>
<p>What about physical plausibility? The application of pseudo-random color patterns, e.g., Perlin noise (<xref ref-type="bibr" rid="B137">Perlin, 2002</xref>), has become a frequent choice for computer vision applications. Despite that these patterns do not occur on real-world objects, this technique has improved the robustness of object detectors (<xref ref-type="bibr" rid="B78">James et al., 2017</xref>; <xref ref-type="bibr" rid="B139">Pinto et al., 2018</xref>). Regarding the randomization of dynamics parameters, no research has so far hinted that physically implausible simulations (e.g., containing bodies with negative masses) are useful. On the other hand, it is safe to say that these can cause numerical instabilities. Thus, ensuring feasibility of the resulting simulator is highly desirable. One solution is to project the domain parameters into a different space, guaranteeing physical plausibility via the inverse projection. For example, a body&#x2019;s mass could be learned in the log-space such that the subsequent exp-transformation, applied before setting the new parameter value, yields strictly positive numbers. However, most of the existing domain randomization approaches can not guarantee physical plausibility.</p>
<p>Even in the case of rigid body dynamics there are notable differences between physics engines, as was observed by <xref ref-type="bibr" rid="B127">Muratore et al. (2018)</xref> when transferring a robot control policy trained using Vortex to Bullet and vice versa. Typical sources for deviations are different coordinate representations, numerical solvers, friction and contact models. Especially the latter two are decisive for robot manipulation. For vision-based tasks, <xref ref-type="bibr" rid="B2">Alghonaim and Johns (2020)</xref> found a strong correlation between the renderer&#x2019;s quality and sim-to-real transferability. Additionally, the authors emphasize the importance of randomizing both distractor objects and background textures for generalizing to unseen environments.</p>
</sec>
<sec id="s3-4">
<title>3.4 Measuring and Predicting the Reality Gap</title>
<p>Coining the term &#x201c;reality gap,&#x201d; <xref ref-type="bibr" rid="B91">Koos et al. (2010)</xref> hypothesize that the fittest solutions in simulation often rely on poorly simulated phenomena. From this, they derive a multi-objective formulation for sim-to-real transfer where performance and transferability need to be balanced. In subsequent work, <xref ref-type="bibr" rid="B92">Koos et al. (2013)</xref> defined a transferability function that maps controller parameters to their estimated target domain performance. A surrogate model of this function is regressed from the real-world fitness values that are obtained by executing the controllers found in simulation.</p>
<p>The Simulation Optimization Bias (SOB) (<xref ref-type="bibr" rid="B127">Muratore et al., 2018</xref>; <xref ref-type="bibr" rid="B125">2021b</xref>) is a quantitative measure for the transferability of a control policy from a set of source domains to a different target domain originating from the same distribution. Building on the formulation of the optimality gap from convex optimization (<xref ref-type="bibr" rid="B109">Mak et al., 1999</xref>; <xref ref-type="bibr" rid="B16">Bayraksan and Morton, 2006</xref>), <xref ref-type="bibr" rid="B127">Muratore et al. (2018)</xref> proposed a Monte Carlo estimator of the SOB as well as an upper confidence bound, tailored to reinforcement learning settings. This bound can be used as an indicator to stop training when the predicted transferability exceeds a threshold. Moreover, the authors show that the SOB is always positive, i.e., optimistic, and in expectation monotonically decreases with an increasing number of domains.</p>
<p>
<xref ref-type="bibr" rid="B37">Collins et al. (2019)</xref> quantify the accuracy of ODE, (Py)Bullet, Newton, Vortex, and MuJoCo in a real-world robotic setup. The accuracy is defined as the accumulated mean-squared error between the Cartesian ground truth position, tracked by a motion capture system, and the simulators&#x2019; prediction. Based on this measure, they conclude that simulators are able to model the control and kinematics accurately, but show deficits during dynamic robot-object interactions.</p>
<p>To obtain a quantitative estimate of the transferability, <xref ref-type="bibr" rid="B186">Zhang et al. (2020)</xref> suggest to learn a probabilistic dynamics model which is evaluated on a static set of target domain trajectories. This dynamics model is trained jointly with the policy in the same randomized simulator. The transferability score is chosen to be the average negative log-likelihood of the model&#x2019;s output given temporal state differences from the real-world trajectories. Thus, the proposed method requires a set of pre-recorded target domain trajectories, and makes the assumption that for a given domain the model&#x2019;s prediction accuracy correlates with the policy performance.</p>
<p>With robot navigation in mind, <xref ref-type="bibr" rid="B82">Kadian et al. (2020)</xref> define the Sim-vs-Real Correlation Coefficient (SRCC) to be the Pearson correlation coefficient on data pairs of scalar performance metrics. The data pairs consist of the policy performance achieved in a simulator instance as well as in the real counterpart. Therefore, in contrast to the SOB (<xref ref-type="bibr" rid="B127">Muratore et al., 2018</xref>), the SRCC requires real-world rollouts. A high SRCC value, i.e., close to 1, predicts good transferability, while low values, i.e., close to 0, indicate that the agent exploited the simulation during learning. <xref ref-type="bibr" rid="B82">Kadian et al. (2020)</xref> also report tuning the domain parameters with grid search to increase the SRCC. By using the Pearson correlation, the SRCC is restricted to linear correlation, which might not be a notable restriction in practice.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Relation of Sim-To-Real to Other Fields</title>
<p>There are several research areas that overlap with sim-to-real in robot learning, more specifically domain randomization (<xref ref-type="fig" rid="F2">Figure 2</xref>). In the following, we describe those that either share the same goal, or employ conceptually similar methods.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Topological overview of the sim-to-real research and a selection of related fields.</p>
</caption>
<graphic xlink:href="frobt-09-799893-g002.tif"/>
</fig>
<sec id="s4-1">
<title>4.1 Curriculum Learning</title>
<p>The key idea behind curriculum learning is to increase the sample efficiency by scheduling the training process such that the agent first encounters &#x201c;easier&#x201d; tasks and gradually progresses to &#x201c;harder&#x201d; ones. Hence, the agent can bootstrap from the knowledge it gained at the beginning, before learning to solve more difficult task instances. Widely known in supervised learning (<xref ref-type="bibr" rid="B18">Bengio et al., 2009</xref>; <xref ref-type="bibr" rid="B96">Kumar et al., 2010</xref>), curriculum learning has been applied to RL (<xref ref-type="bibr" rid="B8">Asada et al., 1996</xref>; <xref ref-type="bibr" rid="B55">Erez and Smart, 2008</xref>; <xref ref-type="bibr" rid="B88">Klink et al., 2019</xref>, <xref ref-type="bibr" rid="B87">2021</xref>). The connection between curriculum learning and domain randomization can be highlighted by viewing the task as a part of the domain, i.e., the MDP, rendering the task parameters a subspace of the domain parameters. From this point of view, the curriculum learning schedule describes how the domain parameter distribution is updated. There are several challenges to using a curriculum learning approach for sim-to-real transfer. Three such challenges are: 1) we can not always assume to have an assessment of the difficulty level of individual domain parameter configurations, 2) curriculum learning does not aim at finding solutions robust to model uncertainty, and 3) curriculum learning methods may require a target distribution which is not defined in the domain randomization setting. However, adjustments can be made to circumvent these problems. <xref ref-type="bibr" rid="B130">OpenAI et al. (2019)</xref> suggested a heuristic for the domain randomization schedule that increases the boundaries of each domain parameter individually until the return drops more than a predefined threshold. 
Executing this approach on a computing cluster, the authors managed to train a policy and a vision system which in combination solve a Rubik&#x2019;s cube with a tendon-driven robotic hand. Another intersection point of curriculum learning and sim-to-real transfer is the work by <xref ref-type="bibr" rid="B122">Morere et al. (2019)</xref>, where a hierarchical planning method for discrete domains with unknown dynamics is proposed. Learning abstract skills based on a curriculum enables the algorithm to outperform planning and RL baselines, even in domains with a very large number of possible states.</p>
</sec>
<sec id="s4-2">
<title>4.2 Meta Learning</title>
<p>Inspired by the human ability to quickly master new tasks by leveraging the knowledge extracted from solving other tasks, meta learning (<xref ref-type="bibr" rid="B155">Santoro et al., 2016</xref>; <xref ref-type="bibr" rid="B60">Finn et al., 2017</xref>) seeks to make use of prior experiences gained from conceptually similar tasks. The field of meta learning currently enjoys high popularity, leading to abundant follow-up work. <xref ref-type="bibr" rid="B63">Grant et al. (2018)</xref> for example casts meta learning as hierarchical Bayesian inference. Furthermore, the meta learning framework has been adapted to the RL setting (<xref ref-type="bibr" rid="B171">Wang et al., 2017</xref>; <xref ref-type="bibr" rid="B128">Nagabandi et al., 2019</xref>). The optimization over an ensemble of tasks can be translated to the optimization over an ensemble of domain instances, modeled by different MDPs (<xref ref-type="sec" rid="s2">Section 2</xref>). <italic>Via</italic> this duality one can view domain randomization as a special form of meta learning where the robot&#x2019;s task remains qualitatively unchanged but the environment varies. Thus, the tasks seen during the meta training phase are analogous to domain instances experienced earlier in the training process. However, when looking at the complete procedure, meta learning and domain randomization are fundamentally different. The goal of meta learning, i.e., <xref ref-type="bibr" rid="B60">Finn et al. (2017)</xref>, is to find a suitable set of initial weights, which when updated generalizes well to a new task. Domain randomization on the other hand strives to directly solve a single task, generalizing over domain instances.</p>
</sec>
<sec id="s4-3">
<title>4.3 Transfer Learning</title>
<p>The term transfer learning covers a wide range of machine learning research, aiming at using knowledge learned in the source domain to solve a task in the target domain. Rooted in classification, transfer learning is categorized in several subfields by for example differentiating 1) if labeled data is available in the source or target domain, and 2) if the tasks in both domains are the same (<xref ref-type="bibr" rid="B131">Pan and Yang, 2010</xref>; <xref ref-type="bibr" rid="B189">Zhuang et al., 2021</xref>). Domain adaptation is one of the resulting subfields, specifying the case where ground truth information is only available in the target domain which is not equal to the source domain while the task remains the same. Thus, domain adaptation methods are in general suitable to tackle sim-to-real problems. However, the research fields evolved at different times in different communities, with different goals in mind. The keyword &#x201c;sim-to-real&#x201d; specifically concerns regression and control problems where the focus lies on overcoming the mismatch between simulation and reality. In contrast, most domain adaptation techniques are not designed for a dynamical system as the target domain.</p>
</sec>
<sec id="s4-4">
<title>4.4 Knowledge Distillation</title>
<p>When executing a controller on a physical device operating at high frequencies, it is of utmost importance that the forward pass finishes within the given time frame. With deep Neural Network (NN) policies, and especially with ensembles of these, this requirement can become challenging to meet. Distilling the knowledge of a larger network into a smaller one reduces the evaluation time. Knowledge distillation (<xref ref-type="bibr" rid="B71">Hinton et al., 2015</xref>) has been successfully applied to several machine learning applications such as natural language processing (<xref ref-type="bibr" rid="B41">Cui et al., 2017</xref>), and object detection (<xref ref-type="bibr" rid="B32">Chen et al., 2017</xref>). In the context of RL, knowledge distillation techniques can be used to compress the learned behavior of one or more teachers into a single student (<xref ref-type="bibr" rid="B151">Rusu et al., 2016a</xref>). Based on samples generated by the teachers, the student is trained in a supervised manner to imitate them. This idea can be applied to sim-to-real robot learning in a straightforward manner, where the teachers can be policies optimal for specific domain instances (<xref ref-type="bibr" rid="B25">Brosseit et al., 2021</xref>). Complementarily, knowledge distillation has been applied to multitask learning (<xref ref-type="bibr" rid="B134">Parisotto et al., 2016</xref>; <xref ref-type="bibr" rid="B165">Teh et al., 2017</xref>), promising to improve sample efficiency when learning a new task. A technical comparison of policy distillation methods for RL is provided by <xref ref-type="bibr" rid="B45">Czarnecki et al. (2019)</xref>.</p>
</sec>
<sec id="s4-5">
<title>4.5 Distributional Robustness</title>
<p>The term robustness is overloaded with different meanings, such as the ability to (quickly) counteract external disturbances, or the resilience against uncertainties in the underlying model&#x2019;s parameters. The field of robust control aims at designing controllers that explicitly deal with these uncertainties (<xref ref-type="bibr" rid="B188">Zhou and Doyle, 1998</xref>). Within this field, distributional robust optimization is a framework to find the worst-case probabilistic model from a so-called ambiguity set, and subsequently set a policy which acts optimally in this worst case. Mathematically, the problem is formulated as bilevel optimization, which is solved iteratively in practice. By restricting the model selection to the ambiguity set, distributional robust optimization regularizes the adversary to prevent the process from yielding solutions that are overly conservative policies. Under the lens of domain randomization, the ambiguity set closely relates to the distribution over domain parameters. <xref ref-type="bibr" rid="B1">Abdulsamad et al. (2021)</xref> for example define the ambiguity set as a Kullback-Leibler (KL) ball around the nominal distribution. Other approaches use a moment-based ambiguity set (<xref ref-type="bibr" rid="B51">Delage and Ye, 2010</xref>) or introduce chance constraints (<xref ref-type="bibr" rid="B170">Van Parys et al., 2016</xref>). For a review of distributional robust optimization, see <xref ref-type="bibr" rid="B187">Zhen et al. (2021)</xref>. <xref ref-type="bibr" rid="B30">Chatzilygeroudis et al. (2020)</xref> point out that performing policy search under an uncertain model is equivalent to finding a policy that can perform well under various dynamics models. Hence, they argue that &#x201c;model-based policy search with probabilistic models is performing something similar to dynamics randomization.&#x201d;</p>
</sec>
<sec id="s4-6">
<title>4.6 System Identification</title>
<p>The goal of system identification is to find the set of model parameters which fit the observed data best, typically by minimizing the prediction-dependent loss such as the mean-squared error. Since the simulator is the pivotal element in every domain randomization method, the accessible parameters and their nominal values are of critical importance. When a manufacturer does not provide data for all model parameters, or when an engineer wants to deploy a new model, system identification is typically the first measure to obtain an estimate of the domain parameters. In principle, a number of approaches can be applied depending on the assumptions on the internal structure of the simulator. The earliest approaches in robotics recognized the linearity of the rigid body dynamics with respect to combinations of physics parameters such as masses, moments of inertia, and link lengths, thus proposed to use linear regression (<xref ref-type="bibr" rid="B10">Atkeson et al., 1986</xref>), and later Bayesian linear regression (<xref ref-type="bibr" rid="B167">Ting et al., 2006</xref>). However, it was quickly observed that the inferred parameters may be physically implausible, leading to the development of methods that can account for this (<xref ref-type="bibr" rid="B166">Ting et al., 2011</xref>). With the advent of deep learning, such structured physics-based approaches have been enhanced with NNs, yielding nonlinear system identification methods such as the ones based on the Newton-Euler forward dynamics (<xref ref-type="bibr" rid="B160">Sutanto et al., 2020</xref>; <xref ref-type="bibr" rid="B108">Lutter et al., 2021b</xref>). Alternatively, the simulator can be augmented with a NN to learn the domain parameter residuals, minimizing the one step prediction error (<xref ref-type="bibr" rid="B3">Allevato et al., 2019</xref>). 
On another front, system identification based on the classification loss between simulated and real samples has been investigated (<xref ref-type="bibr" rid="B53">Du et al., 2021</xref>; <xref ref-type="bibr" rid="B80">Jiang et al., 2021</xref>). System identification can also be interpreted as an episodic RL problem by treating the trajectory mismatch as the cost function and iteratively updating a distribution over models (<xref ref-type="bibr" rid="B31">Chebotar et al., 2019</xref>). Recent simulation-based inference methods yield highly expressive posterior distributions that capture multi-modality as well as correlations between the domain parameters (<xref ref-type="sec" rid="s4-8">Section 4.8</xref>).</p>
</sec>
<sec id="s4-7">
<title>4.7 Adaptive Control</title>
<p>The well-established field of adaptive control is concerned with the problem of adapting a controller&#x2019;s parameters at runtime to operate initially uncertain or varying systems (e.g., aircraft reaching supersonic speed). A prominent method is model reference adaptive control, which tracks a reference model&#x2019;s output specifying the desired closed-loop behavior. Model Identification Adaptive Control (MIAC) is a different variant, which includes an online system identification component that continuously estimates the system&#x2019;s parameters based on the prediction error of the output signal (<xref ref-type="bibr" rid="B9">&#xc5;str&#xf6;m and Wittenmark, 2008</xref>; <xref ref-type="bibr" rid="B98">Landau et al., 2011</xref>). Given the identified system, the controller is updated subsequently. Similarly, there exists a line of sim-to-real reinforcement learning approaches that condition the policy on the estimated domain parameters (<xref ref-type="bibr" rid="B184">Yu et al., 2017</xref>, <xref ref-type="bibr" rid="B183">2019b</xref>; <xref ref-type="bibr" rid="B123">Mozifian et al., 2020</xref>) or a latent representation thereof (<xref ref-type="bibr" rid="B182">Yu et al., 2019a</xref>; <xref ref-type="bibr" rid="B19">Peng et al., 2020</xref>; <xref ref-type="bibr" rid="B95">Kumar et al., 2021</xref>). The main difference to MIAC lies in the adaption mechanism. Adaptive control techniques typically define the parameters&#x2019; gradient proportional to the prediction error, while the approaches referenced above make the domain parameters an input to the policy.</p>
</sec>
<sec id="s4-8">
<title>4.8 Simulation-Based Inference</title>
<p>Simulators are predominantly used as forward models, i.e., to make predictions. However, with the increasing fidelity and expressiveness of simulators, there is a growing interest to also use them for probabilistic inference (<xref ref-type="bibr" rid="B40">Cranmer et al., 2020</xref>). In the case of simulation-based inference, the simulator and its parameters define the statistical model. Inference tasks differ by the quantity to be inferred. Regarding sim-to-real transfer, the most frequent task is to infer the simulation parameters from real-world time series data. Similarly to system identification (<xref ref-type="sec" rid="s4-6">Section 4.6</xref>), the result can be a point estimate, or a posterior distribution. Likelihood-Free Inference (LFI) methods are a type of simulation-based inference approaches which are particularly well-suited when we can make very few assumptions about the underlying generative model, treating it as an implicit function. These approaches only require samples from the model (e.g., a non-differentiable black-box simulator) and a measure of how likely real observations could have been generated from the simulator. Approximate Bayesian computation is a well-known class of LFI methods that applies Monte Carlo sampling to infer the parameters by comparing summary statistics of synthetically generated and observed data. There exist plenty of variants for approximate Bayesian computation (<xref ref-type="bibr" rid="B112">Marjoram et al., 2003</xref>; <xref ref-type="bibr" rid="B17">Beaumont et al., 2009</xref>; <xref ref-type="bibr" rid="B159">Sunn&#xe5;ker et al., 2013</xref>) as well as studies on the design of low-dimensional summary statistics (<xref ref-type="bibr" rid="B58">Fearnhead and Prangle, 2012</xref>). In order to increase the efficiency and thereby scale LFI to higher-dimensional problems, researchers investigated amortized approaches, which conduct the inference over multiple sequential rounds. 
Sequential neural posterior estimation approaches (<xref ref-type="bibr" rid="B132">Papamakarios and Murray, 2016</xref>; <xref ref-type="bibr" rid="B104">Lueckmann et al., 2017</xref>; <xref ref-type="bibr" rid="B64">Greenberg et al., 2019</xref>) approximate the conditional posterior, allowing for direct sampling from the posterior. Learning the likelihood (<xref ref-type="bibr" rid="B133">Papamakarios et al., 2019</xref>) can be useful in the context of hypothesis testing. Alternatively, posterior samples can be generated from likelihood-ratios (<xref ref-type="bibr" rid="B54">Durkan et al., 2020</xref>; <xref ref-type="bibr" rid="B70">Hermans et al., 2020</xref>). However, simulation-based inference does not explicitly consider policy optimization or domain randomization. Recent approaches connected all three techniques, and closed the reality gap by inferring a distribution over simulators while training policies in simulation (<xref ref-type="bibr" rid="B146">Ramos et al., 2019</xref>; <xref ref-type="bibr" rid="B13">Barcelos et al., 2020</xref>; <xref ref-type="bibr" rid="B126">Muratore et al., 2021c</xref>).</p>
</sec>
</sec>
<sec id="s5">
<title>5 Domain Randomization for Sim-To-Real Transfer</title>
<p>We distinguish between static (<xref ref-type="sec" rid="s5-1">Section 5.1</xref>), adaptive (<xref ref-type="sec" rid="s5-2">Section 5.2</xref>), and adversarial (<xref ref-type="sec" rid="s5-3">Section 5.3</xref>) domain randomization (<xref ref-type="fig" rid="F3">Figure 3</xref>). Static, as well as adaptive, methods are characterized by randomly sampling a set of domain parameters <bold>
<italic>&#x3be;</italic>
</bold> &#x223c; <italic>p</italic>(<bold>
<italic>&#x3be;</italic>
</bold>) at the beginning of each simulated rollout. A randomization scheme is categorized as adaptive if the domain parameter distribution is updated during learning, otherwise the scheme is called static. The main advantage of adaptive schemes is that they alleviate the need for hand-tuning the distributions of the domain parameters, which is currently a decisive part of the hyper-parameter search in a static scheme. Nonetheless, the prior distributions still demand design decisions. On the downside, every form of adaptation requires data from the target domain, typically the real robot, which is significantly more expensive to obtain. Another approach for learning robust policies in simulation is to apply adversarial disturbances during the training process. We classify these perturbations as a form of domain randomization, since they either depend on a highly stochastic adversary learned jointly with the policy, or directly contain a random process controlling the application of the perturbation. Adversarial approaches may yield exceptionally robust control strategies. However, without any further restrictions, it is always possible to create scenarios in which the protagonist agent can never win, i.e., the policy can not learn the task. Balancing the adversary&#x2019;s power is pivotal to an adversarial domain randomization method, adding a sensitive hyper-parameter.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Topological overview of domain randomization methods.</p>
</caption>
<graphic xlink:href="frobt-09-799893-g003.tif"/>
</fig>
<p>Another way to distinguish domain randomization concepts is the representation of the domain parameter distribution. The vast majority of algorithms assume a specific probability distribution (e.g., normal or uniform) independently for every parameter. This modeling decision has the benefit of greatly reducing the complexity, but at the same time severely limits the expressiveness. Novel LFI methods (<xref ref-type="sec" rid="s5-2">Section 5.2</xref>) estimate the complete posterior, hence allow the recognition of correlations between the domain parameters, multi-modality, and skewness.</p>
<sec id="s5-1">
<title>5.1 Static Domain Randomization</title>
<p>Approaches that sample from a fixed domain parameter distribution typically aim at performing sim-to-real transfer without using any real-world data (<xref ref-type="fig" rid="F4">Figure 4</xref>). Since running the policy on a physical device is generally the most difficult and time-consuming part, static approaches promise quick and relatively easy to obtain results. In terms of final policy performance in the target domain, these methods are usually inferior to those that adapt the domain parameter distribution. Nevertheless, static domain randomization has bridged the reality gap in several cases.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Conceptual illustration of static domain randomization.</p>
</caption>
<graphic xlink:href="frobt-09-799893-g004.tif"/>
</fig>
<sec id="s5-1-1">
<title>5.1.1 Randomizing Dynamics Without Using Real-World Data at Runtime</title>
<p>More than a decade ago, <xref ref-type="bibr" rid="B172">Wang et al. (2010)</xref> proposed to randomize the simulator in which the training data is generated. The authors examined the randomization of initial states, external disturbances, goals, and actuator noise, clearly showing an improved robustness of the learned locomotion controllers in simulated experiments (sim-to-sim). <xref ref-type="bibr" rid="B121">Mordatch et al. (2015)</xref> used a finite model ensemble to run (offline) trajectory optimization on a small-scale humanoid robot, achieving one of the first sim-to-real transfers in robotics powered by domain randomization. Similarly, <xref ref-type="bibr" rid="B103">Lowrey et al. (2018)</xref> employed the Natural Policy Gradient (<xref ref-type="bibr" rid="B84">Kakade, 2001</xref>) to learn a continuous controller for a three-finger positioning task, after carefully identifying the system&#x2019;s parameters. Conforming with <xref ref-type="bibr" rid="B121">Mordatch et al. (2015)</xref>, their results showed that the policy learned from the identified model was able to perform the sim-to-real transfer, but the policies learned from an ensemble of models were more robust to modeling errors. In contrast, <xref ref-type="bibr" rid="B136">Peng et al. (2018)</xref> combined model-free RL with recurrent NN policies that were trained using hindsight experience replay (<xref ref-type="bibr" rid="B5">Andrychowicz et al., 2017</xref>) in order to push an object by controlling a robotic arm. <xref ref-type="bibr" rid="B164">Tan et al. (2018)</xref> presented an example for learning quadruped gaits from randomized simulations, where particular efforts were made to conduct a prior system identification. They empirically found that sampling domain parameters from a uniform distribution together with applying random forces and regularizing the observation space can be enough to cross the reality gap. 
For quadrotor control, <xref ref-type="bibr" rid="B120">Molchanov et al. (2019)</xref> trained feedforward NN policies which generalize over different physical drones. The suggested randomization includes a custom model for motor lag and noise based on an Ornstein-Uhlenbeck process. <xref ref-type="bibr" rid="B144">Rajeswaran et al. (2017)</xref> explored the use of a risk-averse objective function, optimizing a lower quantile of the return. The method was only evaluated on simulated MuJoCo tasks, however it was also one of the first methods that draws upon the Bayesian perspective. Moreover, this approach was employed as a baseline by <xref ref-type="bibr" rid="B125">Muratore et al. (2021b)</xref>, who introduced a measure for the inter-domain transferability of controllers together with a risk-neutral randomization scheme. The resulting policies have the unique feature of providing a (probabilistic) guarantee on the estimated transferability and managed to directly transfer to the real platform in two different experiments. <xref ref-type="bibr" rid="B157">Siekmann et al. (2021)</xref> achieved the sim-to-real transfer of a recurrent NN policy for bipedal walking. The policy was trained using model-free RL in simulation with uniformly distributed dynamics parameters as well as randomized task-specific terrain. According to the authors, the recurrent architecture and the terrain randomization were pivotal.</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Randomizing Dynamics Using Real-World Data at Runtime</title>
<p>The work by <xref ref-type="bibr" rid="B42">Cully et al. (2015)</xref> can be seen as both static and adaptive domain randomization, where a large set of hexapod locomotion policies is learned before execution on the physical robot, and subsequently evaluated in simulation. Every policy is associated with one configuration of the so-called behavioral descriptors, which can be interpreted as domain parameters. Instead of retraining or fine-tuning, the proposed algorithm reacts to performance drops, e.g., due to damage, by querying Bayesian Optimization (BO) to sequentially select one of the pretrained policies and measure its performance on the robot. Instead of randomizing the simulator parameters, <xref ref-type="bibr" rid="B44">Cutler and How (2015)</xref> explored learning a probabilistic model, chosen to be a GP, of the environment using data from both simulated and real-world dynamics. A key feature of this method is to incorporate the simulator as a prior for the probabilistic model, and subsequently use this information for the policy updates with PILCO (<xref ref-type="bibr" rid="B50">Deisenroth and Rasmussen, 2011</xref>). The authors demonstrated policy transfer for an inverted pendulum task. In follow-up work, <xref ref-type="bibr" rid="B43">Cutler and How (2016)</xref> extended the algorithm to make a remote-controlled toy car learn how to drift in circles. <xref ref-type="bibr" rid="B7">Antonova et al. (2019)</xref> propose a sequential Variational AutoEncoder (VAE) to embed trajectories into a compressed latent space which is used with BO to search for controllers. The VAE and the domain-specific high-level controllers are learned jointly, while the randomization scheme is left unchanged. 
Leveraging a custom kernel which measures the KL divergence between trajectories and the data efficiency of BO, the authors report successful sim-to-real transfers after 10 target domain trials for a hexapod locomotion task as well as 20 trials for a manipulation task. <xref ref-type="bibr" rid="B95">Kumar et al. (2021)</xref> learned a quadruped locomotion policy that passed joint positions to a lower level PD controller without using any real-world data. The essential components of this approach are the encoder that projects the domain parameters to a latent space and the adaptation module which is trained to regress the latent state from the recent history of measured states and actions. The policy is conditioned on the current state, the previous actions, and the latent state which needs to be reconstructed during deployment in the physical world. Emphasizing the importance of the carefully engineered reward function, the authors demonstrate the method&#x2019;s ability to transfer from simulation to various outdoor terrains.</p>
</sec>
<sec id="s5-1-3">
<title>5.1.3 Randomizing Visual Appearance and Configurations</title>
<p>
<xref ref-type="bibr" rid="B168">Tobin et al. (2017)</xref> learned an object detector for robot grasping using a fixed domain parameter distribution, and bridged the gap with a deep NN policy trained exclusively on simulated RGB images. Similarly, <xref ref-type="bibr" rid="B78">James et al. (2017)</xref> added various distracting shapes as well as structured noise (<xref ref-type="bibr" rid="B137">Perlin, 2002</xref>) when learning a robot manipulation task with an end-to-end controller that mapped pixels to motor velocities. The approach presented by <xref ref-type="bibr" rid="B139">Pinto et al. (2018)</xref> combines the concepts of static domain randomization and actor-critic training (<xref ref-type="bibr" rid="B101">Lillicrap et al., 2016</xref>), enabling the direct sim-to-real transfer of the abilities to pick, push, or move objects. While the critic has access to the simulator&#x2019;s full state, the policy only receives images of the environment, creating an information asymmetry. <xref ref-type="bibr" rid="B114">Matas et al. (2018)</xref> used the asymmetric actor-critic idea from <xref ref-type="bibr" rid="B139">Pinto et al. (2018)</xref> as well as several other improvements to train a deep NN policy end-to-end, seeded with prior demonstrations. Solving three variations of a tissue folding task, this work scales sim-to-real visuomotor manipulation to deformable objects. Purely visual domain randomization has also been applied to aerial robotics, where <xref ref-type="bibr" rid="B154">Sadeghi and Levine (2017)</xref> achieved sim-to-real transfer for learning to fly a drone through indoor environments. The resulting deep NN policy was able to map from monocular images to normalized 3D drone velocities. Similarly, <xref ref-type="bibr" rid="B141">Polvara et al. (2020)</xref> demonstrated landing of a quadrotor trained in end-to-end fashion using randomized environments. <xref ref-type="bibr" rid="B47">Dai et al. 
(2019)</xref> investigated the effect of domain randomization on visuomotor policies, and observed that this leads to more redundant and entangled representations accompanied with significant statistical changes in the weights. <xref ref-type="bibr" rid="B180">Yan et al. (2020)</xref> apply Model Predictive Control (MPC) to manipulate deformable objects using a forward model based on visual input. The novelty of this approach is that the predictive model is trained jointly with an embedding to minimize a contrastive loss (<xref ref-type="bibr" rid="B169">van den Oord et al., 2018</xref>) in the latent space. Finally, domain randomization was applied to transfer the behavior from simulation to the real robot.</p>
</sec>
<sec id="s5-1-4">
<title>5.1.4 Randomizing Dynamics, Randomizing Visual Appearance, and Configurations</title>
<p>Combining Generative Adversarial Networks (GANs) and domain randomization, <xref ref-type="bibr" rid="B21">Bousmalis et al. (2018)</xref> greatly reduced the number of necessary real-world samples for learning a robotic grasping task. The essence of their method is to transform simulated monocular RGB images in a way that is closely matched to the real counterpart. Extensive evaluation on the physical robot showed that domain randomization as well as the suggested pixel-level domain adaptation technique were important to successfully transfer. Despite the pixel-level domain adaptation technique being learned, the policy optimization in simulation is done with a fixed randomization scheme. In related work <xref ref-type="bibr" rid="B79">James et al. (2019)</xref> train a GAN to transform randomized images to so-called canonical images, such that a corresponding real image would be transformed to the same one. This approach allowed them to train purely from simulated images, and optionally fine-tune the policy on target domain data. Notably, the robotic in-hand manipulation conducted by <xref ref-type="bibr" rid="B6">OpenAI et al. (2020)</xref> demonstrated that domain randomization in combination with careful model engineering and the usage of recurrent NNs enables sim-to-real transfer on an unprecedentedly difficulty level.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 Adaptive Domain Randomization</title>
<p>Static domain randomization (<xref ref-type="sec" rid="s5-1">Section 5.1</xref>) is inherently limited and implicitly assumes knowledge of the true mean of the domain parameters or accepts biased samples (<xref ref-type="fig" rid="F5">Figure 5</xref>). Adapting the randomization scheme allows the training to narrow or widen the search distribution in order to fulfill one or multiple criteria which can be chosen freely. The mechanism devised for updating the domain parameter distribution as well as the procedure to collect meaningful target domain data are typically the center piece of adaptive randomization algorithms. In this process the execution of intermediate policies on the physical device is the most likely point of failure. However, approaches that update the distribution solely based on data from the source domain are less flexible and generally less effective.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Conceptual illustration of adaptive domain randomization.</p>
</caption>
<graphic xlink:href="frobt-09-799893-g005.tif"/>
</fig>
<sec id="s5-2-1">
<title>5.2.1 Conditioning Policies on the Estimated Domain Parameters</title>
<p>
<xref ref-type="bibr" rid="B184">Yu et al. (2017)</xref> suggested the use of a NN policy that is conditioned on the state and the domain parameters. Since these parameters are not assumed to be known, they have to be estimated, e.g., with online system identification. For this purpose, a second NN is trained to regress the domain parameters from the observed rollouts. By applying this approach to simulated continuous control tasks, the authors showed that adding the online system identification module can enable an adaptation to sudden changes in the environment. In subsequent research, <xref ref-type="bibr" rid="B182">Yu et al. (2019a)</xref> intertwined policy optimization, system identification, and domain randomization. The proposed method first identifies bounds on the domain parameters which are later used for learning from the randomized simulator. In a departure from their previous approach, the policy is conditioned on a latent space projection of the domain parameters. After training in simulation, a second system identification step runs BO for a fixed number of iterations to find the most promising projected domain parameters. The algorithm was evaluated on sim-to-real bipedal robot walking. <xref ref-type="bibr" rid="B123">Mozifian et al. (2020)</xref> also introduce a dependence of the policy w.r.t. the domain parameters. These are updated by gradient ascent on the average return over domains, regularized by a penalty proportional to the KL divergence. Similar to <xref ref-type="bibr" rid="B149">Ruiz et al. (2019)</xref>, the authors update the domain parameter distribution using the score function gradient estimator. <xref ref-type="bibr" rid="B123">Mozifian et al. (2020)</xref> tested their method on sim-to-sim robot locomotion tasks. It remains unclear whether this approach scales to sim-to-real scenarios since the adaptation is done based on the return obtained in simulation, thus is not physically grounded. 
Bootstrapping from pre-recorded motion capture data of animals, <xref ref-type="bibr" rid="B19">Peng et al. (2020)</xref> learned quadruped locomotion skills with a synthesis of imitation learning, domain randomization, and domain adaptation (<xref ref-type="sec" rid="s4-3">Section 4.3</xref>). The introduced method is conceptually related to the approach of <xref ref-type="bibr" rid="B183">Yu et al. (2019b)</xref>, but adds an information bottleneck. According to the authors, this bottleneck is necessary because without it, the policy has access to the underlying dynamics parameters and becomes overly dependent on them, which leads to brittle behavior. To avoid this overfitting, <xref ref-type="bibr" rid="B19">Peng et al. (2020)</xref> limit the mutual information between the domain parameters and their encoding, realized as penalty on the KL divergence from a zero-mean Gaussian prior on the latent variable.</p>
</sec>
<sec id="s5-2-2">
<title>5.2.2 The Bilevel Optimization Perspective</title>
<p>
<xref ref-type="bibr" rid="B124">Muratore et al. (2021a)</xref> formulated adaptive domain randomization as a bilevel optimization that consists of an upper and a lower level problem. In this framework, the upper level is concerned with finding the domain parameter distribution, which when used for training in simulation leads to a policy with maximal real-world return. The lower level problem seeks to find a policy in the current randomized source domain. Using BO for the upper level and model-free RL for the lower level, <xref ref-type="bibr" rid="B124">Muratore et al. (2021a)</xref> compare their method in two underactuated sim-to-real robotic tasks against two baselines. Picturing the real-world return analogous to the probability for optimality, this approach reveals parallels to control as inference (<xref ref-type="bibr" rid="B148">Rawlik et al., 2012</xref>; <xref ref-type="bibr" rid="B99">Levine and Koltun, 2013</xref>; <xref ref-type="bibr" rid="B173">Watson et al., 2021</xref>), where the control variates are the parameters of the domain distribution. BO has also been employed by <xref ref-type="bibr" rid="B135">Paul et al. (2019)</xref> to adapt the distribution of domain parameters such that using these for the subsequent training maximizes the policy&#x2019;s return. Their method models the relation between the current domain parameters, the current policy and the return of the updated policy with a GP. Choosing the domain parameters that maximize the return in simulation is critical, since this creates the possibility to adapt the environment such that it is easier for the agent to solve. This design decision requires the policy parameters to be fed into the GP which is prohibitively expensive if the full set of parameters are used. Therefore, abstractions of the policy, so-called fingerprints, are created. 
These handcrafted features, e.g., a Gaussian approximation of the stationary state distribution, replace the policy to reduce the input dimension. <xref ref-type="bibr" rid="B135">Paul et al. (2019)</xref> tested the suggested algorithm on three sim-to-sim tasks, focusing on the handling of so-called significant rare events. Embedding the domain parameters into the mean function of a GP which models the system dynamics, <xref ref-type="bibr" rid="B28">Chatzilygeroudis and Mouret (2018)</xref> extended a black-box policy search algorithm (<xref ref-type="bibr" rid="B29">Chatzilygeroudis et al., 2017</xref>) with a simulator as prior. The approach explicitly searches for parameters of the simulator that fit the real-world data in an upper level loop, while optimizing the GP&#x2019;s hyper-parameters in a lower level loop. This method allowed a damaged hexapod robot to walk in less than 30&#xa0;s. <xref ref-type="bibr" rid="B149">Ruiz et al. (2019)</xref> proposed a meta-algorithm which is based on a bilevel optimization problem and updates the domain parameter distribution using REINFORCE (<xref ref-type="bibr" rid="B174">Williams, 1992</xref>). The approach has been evaluated in simulation on synthetic data, except for a semantic segmentation task. Thus, there was no dynamics-dependent interaction of the learned policy with the real world. <xref ref-type="bibr" rid="B116">Mehta et al. (2019)</xref> also formulated the adaptation of the domain parameter distribution as an RL problem where different simulation instances are sampled and compared against a reference environment based on the resulting trajectories. This comparison is done by a discriminator which yields rewards proportional to the difficulty of distinguishing the simulated and real environments, hence providing an incentive to generate distinct domains. 
Using this reward signal, the domain parameters of the simulation instances are updated <italic>via</italic> Stein Variational Policy Gradient (<xref ref-type="bibr" rid="B102">Liu et al., 2017</xref>). <xref ref-type="bibr" rid="B116">Mehta et al. (2019)</xref> evaluated their method in a sim-to-real experiment where a robotic arm had to reach a desired point. In contrast, <xref ref-type="bibr" rid="B31">Chebotar et al. (2019)</xref> presented a trajectory-based framework for closing the reality gap, and validated it on two sim-to-real robotic manipulation tasks. The proposed procedure adapts the domain parameter distribution&#x2019;s parameters by minimizing discrepancy between observations from the real-world system and the simulation. To measure the discrepancy, <xref ref-type="bibr" rid="B31">Chebotar et al. (2019)</xref> use a linear combination of the <italic>L</italic>
<sup>1</sup> and <italic>L</italic>
<sup>2</sup> norm between simulated and real trajectories. These values are then plugged in as costs for Relative Entropy Policy Search (REPS) (<xref ref-type="bibr" rid="B138">Peters et al., 2010</xref>) to update the simulator&#x2019;s parameters, hence turning the simulator identification into an episodic RL problem. The policy optimization was done using Proximal Policy Optimization (PPO) (<xref ref-type="bibr" rid="B156">Schulman et al., 2017</xref>), a step-based model-free RL algorithm.</p>
</sec>
<sec id="s5-2-3">
<title>5.2.3 Removing Restrictions on the Domain Parameter Distribution</title>
<p>
<xref ref-type="bibr" rid="B146">Ramos et al. (2019)</xref> perform a fully Bayesian treatment of the simulator&#x2019;s parameters by employing Likelihood-Free Inference (LFI) with a Mixture Density Network (MDN) as model for the density estimator. Analyzing the obtained posterior over domain parameters, they showed that the proposed method is, in a sim-to-sim scenario, able to simultaneously infer different parameter configurations which can explain the observed trajectories. An evaluation over a grid of domain parameters confirms that the policies trained with the inferred posterior are more robust to model uncertainties. The key benefit over previous approaches is that the domain parameter distribution is not restricted to belong to a specific family, e.g., normal or uniform. Instead, the true posterior is approximated by the density estimator, fitted using LFI (<xref ref-type="bibr" rid="B132">Papamakarios and Murray, 2016</xref>). In follow-up work, <xref ref-type="bibr" rid="B142">Possas et al. (2020)</xref> addressed the problem of learning the behavioral policies which are required for the collection of target domain data. By integrating policy optimization <italic>via</italic> model-free RL, the authors created an online variant of the original method. The sim-to-real experiments were carried out using MPC where (only) the model parameters are updated based on the result from the LFI routine. <xref ref-type="bibr" rid="B115">Matl et al. (2020)</xref> scaled the Bayesian inference procedure of <xref ref-type="bibr" rid="B146">Ramos et al. (2019)</xref> to the simulation of granular media, estimating parameters such as friction and restitution coefficients. <xref ref-type="bibr" rid="B13">Barcelos et al. (2020)</xref> presented a method that interleaves domain randomization, LFI, and policy optimization. 
The controller is updated <italic>via</italic> nonlinear MPC while using the unscented transform to simulate different domain instances for the control horizon. Hence, this algorithm allows one to calibrate the uncertainty as the system evolves with the passage of time, attributing higher costs to more uncertain paths. For performing the essential LFI, the authors build upon the work of <xref ref-type="bibr" rid="B146">Ramos et al. (2019)</xref> to identify the posterior domain parameters, which are modeled by a mixture of Gaussians. The approach was validated on a simulated inverted pendulum swing-up task as well as a real trajectory following task using a wheeled robot. Since the density estimation problem is the center piece of LFI-based domain randomization, improving the estimator&#x2019;s flexibility is of great interest. <xref ref-type="bibr" rid="B126">Muratore et al. (2021c)</xref> employed a sequential neural posterior estimation algorithm (<xref ref-type="bibr" rid="B64">Greenberg et al., 2019</xref>) which uses normalizing flows to estimate the (conditional) posterior over simulators. In combination with a segment-wise synchronization between the simulations and the recorded real-world trajectories, <xref ref-type="bibr" rid="B126">Muratore et al. (2021c)</xref> demonstrated the neural inference method&#x2019;s ability to learn the posterior belief over contact-rich black-box simulations. Moreover, the proposed approach was evaluated with policy optimization in the loop on an underactuated swing-up and balancing task, showing improved results compared to BayesSim (<xref ref-type="bibr" rid="B146">Ramos et al., 2019</xref>) as well as Bayesian linear regression.</p>
</sec>
</sec>
<sec id="s5-3">
<title>5.3 Adversarial Domain Randomization</title>
<p>Extensive prior studies have shown that deep NN classifiers are vulnerable to imperceptible perturbations of their inputs, obtained <italic>via</italic> adversarial optimization, leading to significant drops in accuracy (<xref ref-type="bibr" rid="B163">Szegedy et al., 2014</xref>; <xref ref-type="bibr" rid="B57">Fawzi et al., 2015</xref>; <xref ref-type="bibr" rid="B62">Goodfellow et al., 2015</xref>; <xref ref-type="bibr" rid="B97">Kurakin et al., 2017</xref>; <xref ref-type="bibr" rid="B74">Ilyas et al., 2019</xref>). This line of research has been extended to reinforcement learning, showing that small (adversarial) perturbations are enough to significantly degrade the policy performance (<xref ref-type="bibr" rid="B73">Huang et al., 2017</xref>). To defend against such attacks, the training data can be augmented with adversarially-perturbed examples, or the adversarial inputs can be detected and neutralized at test-time (<xref ref-type="fig" rid="F6">Figure 6</xref>). However, studies of existing defenses have shown that adversarial examples are harder to detect than originally believed (<xref ref-type="bibr" rid="B27">Carlini and Wagner, 2017</xref>). It is safe to assume that this insight gained from computer vision problems transfers to the RL setting, on which we focus here.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Conceptual illustration of adversarial domain randomization.</p>
</caption>
<graphic xlink:href="frobt-09-799893-g006.tif"/>
</fig>
<sec id="s5-3-1">
<title>5.3.1 Adversary Available Analytically</title>
<p>
<xref ref-type="bibr" rid="B110">Mandlekar et al. (2017)</xref> proposed physically plausible perturbations by randomly deciding when to add a scaled gradient of the expected return w.r.t. the state. Their sim-to-sim evaluation on four MuJoCo tasks showed that agents trained with the suggested adversarial randomization generalize slightly better to domain parameter configurations than agents trained with a static randomization scheme. <xref ref-type="bibr" rid="B106">Lutter et al. (2021a)</xref> derived the optimal policy together with different optimal disturbances from the value function in a continuous state, action, and time RL setting. Despite outstanding sim-to-real transferability of the resulting policies, the presented approach is conceptually restricted by assuming access to a compact representation of the state domain, typically obtained through exhaustive sampling, which hinders the scalability to high-dimensional tasks.</p>
</sec>
<sec id="s5-3-2">
<title>5.3.2 Adversary Learned <italic>via</italic> Two-Player Games</title>
<p>Domain randomization can be described using a game theoretic framework. Focusing on two-player games for model-based RL, <xref ref-type="bibr" rid="B145">Rajeswaran et al. (2020)</xref> define a &#x201c;policy player&#x201d; which maximizes rewards in the learned model and a &#x201c;model player&#x201d; which minimizes prediction error of data collected by the policy player. This formulation can be transferred to the sim-to-real scenario in different ways. One example is to make the &#x201c;policy player&#x201d; model-agnostic and to let the &#x201c;model player&#x201d; control the domain parameters. <xref ref-type="bibr" rid="B140">Pinto et al. (2017)</xref> introduced the idea of a second agent whose goal it is to hinder the first agent from fulfilling its task. This adversary has the ability to apply force disturbances at predefined locations of the robot&#x2019;s body, while the domain parameters remain unchanged. Both agents are trained in alternation using RL, making this a zero-sum game. Similarly, <xref ref-type="bibr" rid="B185">Zhang et al. (2021)</xref> aim to train an agent using adversarial examples such that it becomes robust against test-time attacks. As in the approach presented by <xref ref-type="bibr" rid="B140">Pinto et al. (2017)</xref>, the adversary and the protagonist are trained alternately until convergence at every meta-iteration. Unlike prior work, <xref ref-type="bibr" rid="B185">Zhang et al. (2021)</xref> build on state-adversarial MDPs manipulating the observations but not the simulation state. Another key property of their approach is that the perturbations are applied after a projection to a bounded set. The proposed observation-based attack as well as training algorithm is supported by four sim-to-sim validations in MuJoCo environments. <xref ref-type="bibr" rid="B80">Jiang et al. 
(2021)</xref> employed GANs to distinguish between source and target domain dynamics, sharing the concept of a learned domain discriminator with <xref ref-type="bibr" rid="B116">Mehta et al. (2019)</xref>. Moreover, the authors proposed to augment an analytical physics simulator with a NN that is trained to maximize the similarity between simulated and real trajectories, turning the identification of the hybrid simulator into an RL problem. The comparison on a sim-to-real quadruped locomotion task showed an advantage over static domain randomization baselines. On the other hand, this method added noise to the behavioral policy in order to obtain diverse target domain trajectories for the simulator identification, which can be considered dangerous.</p>
</sec>
</sec>
</sec>
<sec id="s6">
<title>6 Discussion and Outlook</title>
<p>To conclude this review, we discuss practical aspects of choosing among the existing domain randomization approaches (<xref ref-type="sec" rid="s6-1">Section 6.1</xref>), emphasizing that sim-to-real transfer can also be achieved without randomizing (<xref ref-type="sec" rid="s6-2">Section 6.2</xref>). Finally, we sketch out several promising directions for future sim-to-real research (<xref ref-type="sec" rid="s6-3">Section 6.3</xref>).</p>
<sec id="s6-1">
<title>6.1 Choosing a Suitable Domain Randomization Approach</title>
<p>Every publication on sim-to-real robot learning presents an approach that surpasses its baselines. So, how should we select the right algorithm given a task? Up to now, there is no benchmark for sim-to-real methods based on the policy&#x2019;s target domain performance, and it is highly questionable if such a comparison could be fair, given that these algorithms have substantially different requirements and goals. The absence of one common benchmark is not necessarily bad, since bundling a set of environments to define a metric would bias research to pursue methods which optimize solely for that metric. A prominent example for this mechanism is the OpenAI Gym (<xref ref-type="bibr" rid="B22">Brockman et al., 2016</xref>), which became the <italic>de facto</italic> standard for RL. Contrarily, a similar development for sim-to-real research is not desirable since the overfitting to a small set of scenarios would be detrimental to the desired transferability and the vast amount of other scenarios.</p>
<p>When choosing from the published algorithms, the practitioner is advised to check if the approach has been tested on at least two different sim-to-real tasks, and if the (sometimes implicit) assumptions can be met. Adaptive domain randomization methods, for example, will require operating the physical device in order to collect real-world data. After all, we can expect that approaches with randomization will be more robust than the ones only trained on a nominal model. This has been shown consistently (<xref ref-type="sec" rid="s5">Section 5</xref>). However, we can not expect that these approaches work out of the box on novel problems without adjusting the hyper-parameters. Another starting point could be the set of sim-to-sim benchmarks released by <xref ref-type="bibr" rid="B117">Mehta et al. (2020)</xref>, targeting the problem of system identification for state-of-the-art domain randomization algorithms.</p>
</sec>
<sec id="s6-2">
<title>6.2 Sim-To-Real Transfer Without Domain Randomization</title>
<p>Domain randomization is one way to successfully transfer control policies learned in simulation to the physical device, but by no means the only way.</p>
<sec id="s6-2-1">
<title>6.2.1 Action Transformation</title>
<p>In order to cope with the inaccuracies of a simulator, <xref ref-type="bibr" rid="B34">Christiano et al. (2016)</xref> propose to train a deep inverse dynamics model to map the action commanded by the policy to a transformed action. When applying the original action to the real system and the transformed action to the simulated system, they would lead to the same next robot state, thus bridging the reality gap. To generate the data for training the inverse dynamics model, preliminary policies are augmented with hand-tuned exploration noise and executed in the target domain. Their approach is based on the observation that a policy&#x2019;s high-level strategy remains valid after sim-to-real transfer, and assumes that the simulator provides a reasonable estimate of the next state. With the same goal in mind, <xref ref-type="bibr" rid="B68">Hanna and Stone (2017)</xref> suggest an action transformation that is learned such that applying the transformed actions in simulation has the same effects as the original actions had on the real system. At the core of the approach is the estimation of neural forward and inverse models based on rollouts executed with the real robot.</p>
</sec>
<sec id="s6-2-2">
<title>6.2.2 Novel Neural Policy Architectures</title>
<p>
<xref ref-type="bibr" rid="B153">Rusu et al. (2017)</xref> employ a progressively growing NN architecture (<xref ref-type="bibr" rid="B152">Rusu et al., 2016b</xref>) to learn an end-to-end approach mapping from pixels to discretized joint velocities. This NN framework enables the reuse of previously gained knowledge as well as the adaptation to new input modalities. The first part of the NN policy is trained in simulation, while the part added when transferring needs to be trained using real-world data. For a relatively simple reaching task, the authors reported requiring approximately 4&#xa0;h of runtime on the physical robot.</p>
</sec>
<sec id="s6-2-3">
<title>6.2.3 Identifying and Improving the Simulator</title>
<p>
<xref ref-type="bibr" rid="B178">Xie et al. (2019)</xref> describe an iterative process including motion tracking, system identification, RL, and knowledge distillation, to learn control policies for humanoid walking on the physical system. This way, the authors can rely on known building blocks resulting in initial and intermediate policies which are reasonably safe to execute. To run a policy on the real robot while learning without the risk of damaging or stopping the device, <xref ref-type="bibr" rid="B85">Kaspar et al. (2020)</xref> propose to combine operational space control and RL. After carefully identifying the simulator&#x2019;s parameters, the RL agent learns to control the end-effector <italic>via</italic> forces on a unit mass-spring-damper system. The constraints and nullspace behavior are abstracted away from the agent, making the RL problem easier and the policy more transferable.</p>
</sec>
</sec>
<sec id="s6-3">
<title>6.3 Promising Future Research Directions</title>
<p>Learning from randomized simulations still offers abundant possibilities to enable or improve the sim-to-real transfer of control policies. In the following section, we describe multiple opportunities for future work in this area of research.</p>
<sec id="s6-3-1">
<title>6.3.1 Real-To-Sim-To-Real Transfer</title>
<p>Creating randomizable simulation environments is time-intensive, and the initial guesses for the domain parameters as well as their variances are typically very inaccurate. It is of great interest to automate this process grounded by real-world data. One viable scenario could be to record an environment with a RGBD camera, and subsequently use the information to reconstruct the scene. Moreover, the recorded data can be processed to infer the domain parameters, which then specify the domain parameter distributions. When devising such a framework, we could start from prior work on 3D scene reconstruction <xref ref-type="bibr" rid="B90">Kolev et al. (2009)</xref>, <xref ref-type="bibr" rid="B67">Haefner et al. (2018)</xref> as well as methods to estimate the degrees of freedom for rigid bodies (<xref ref-type="bibr" rid="B113">Martin-Martin and Brock, 2014</xref>). A data-based automatic generation of simulation environments (real-to-sim-to-real) not only promises to reduce the workload, but would also yield a meaningful initialization for domain distribution parameters.</p>
</sec>
<sec id="s6-3-2">
<title>6.3.2 Policy Architectures With Inductive Biases</title>
<p>Deep NNs are by far the most common policy type, favored because of their flexibility and expressiveness. However, they are also brittle w.r.t. changes in their inputs (<xref ref-type="bibr" rid="B163">Szegedy et al., 2014</xref>; <xref ref-type="bibr" rid="B62">Goodfellow et al., 2015</xref>; <xref ref-type="bibr" rid="B73">Huang et al., 2017</xref>). Due to the inevitable domain shift in sim-to-real scenarios this input sensitivity is magnified. The success of domain randomization methods for robot learning can largely be attributed to their ability of regularizing deep NN policies by diversifying the training data. Generally, one may also introduce regularization to the learning by designing alternative models for the control policies, e.g., linear combination of features and parameters, (time varying) mixtures of densities, or movement primitives. All of these have their individual strengths and weaknesses. We believe that pairing the expressiveness of deep NNs with physically-grounded prior knowledge leads to controllers that achieve high performance and suffer less from transferring to the real world, since they are able to bootstrap from their prior. There are multiple ways to incorporate abstract knowledge about physics. We can for example restrict the policy to obey stable system dynamics derived from first principles (<xref ref-type="bibr" rid="B65">Greydanus et al., 2019</xref>; <xref ref-type="bibr" rid="B107">Lutter et al., 2019</xref>). Another approach is to design the model class such that the closed-loop system is passive for all parameterizations of the learned policy, thus guaranteeing stability in the sense of Lyapunov as well as bounded output energy given bounded input energy (<xref ref-type="bibr" rid="B23">Brogliato et al., 2007</xref>; <xref ref-type="bibr" rid="B181">Yang et al., 2013</xref>; <xref ref-type="bibr" rid="B46">Dai et al., 2021</xref>). 
All these methods would require significant exploration in the environment, making it even more challenging to learn successful controllers in the real-world directly. Leveraging randomized simulation is likely going to be a critical component in demonstrating solving sequential problems on real robots.</p>
</sec>
<sec id="s6-3-3">
<title>6.3.3 Towards Dual Control <italic>via</italic> Neural Likelihood-Free Inference</title>
<p>Continuing the direction of adaptive domain randomization, we are convinced that neural LFI powered by normalizing flows is an auspicious approach. The combination of highly flexible density estimators with widely applicable and sample-efficient inference methods allows one to identify multi-modal distributions over simulators with very mild assumptions (<xref ref-type="bibr" rid="B146">Ramos et al., 2019</xref>; <xref ref-type="bibr" rid="B12">Barcelos et al., 2021</xref>; <xref ref-type="bibr" rid="B126">Muratore et al., 2021c</xref>). By introducing an auxiliary optimality variable and making the policy parameters subject to the inference, we obtain the posterior over policies quantifying their likelihood of being optimal. While this idea is well-known in the control-as-inference community (<xref ref-type="bibr" rid="B148">Rawlik et al., 2012</xref>; <xref ref-type="bibr" rid="B99">Levine and Koltun, 2013</xref>; <xref ref-type="bibr" rid="B173">Watson et al., 2021</xref>), prior methods were limited to less powerful density estimation procedures. Taking this idea one step further, we could additionally include the domain parameters for inference, and thereby establish connections to dual control (<xref ref-type="bibr" rid="B59">Feldbaum, 1960</xref>; <xref ref-type="bibr" rid="B175">Wittenmark, 1995</xref>).</p>
</sec>
<sec id="s6-3-4">
<title>6.3.4 Accounting for the Cost of Information Collection</title>
<p>Another promising direction for future research is the combination of simulated and real-world data collection with explicit consideration of the different costs when sampling from the two domains, subject to a restriction of the overall computational budget. One part of this problem was already addressed by <xref ref-type="bibr" rid="B111">Marco et al. (2017)</xref>, showing how simulation can be used to alleviate the need for real-world samples when finding a set of policy parameters. However, the question of how to schedule the individual (simulated or real) experiments and when to stop the procedure, i.e., when does the cost of gathering information exceed its expected benefit, is not answered for sim-to-real transfer yet. This question relates to the problems of optimal stopping (<xref ref-type="bibr" rid="B33">Chow and Robbins, 1963</xref>) as well as multi-fidelity optimization (<xref ref-type="bibr" rid="B61">Forrester et al., 2007</xref>), and can be seen as a reformulation thereof in the context of simulation-based learning.</p>
</sec>
<sec id="s6-3-5">
<title>6.3.5 Solving Sequential Problems</title>
<p>The problem settings considered in the overwhelming majority of related publications, are (continuous) control tasks which do not have a sequential nature. In contrast, most real-world tasks such as the ones posed at the DARPA Robotics Challenge (<xref ref-type="bibr" rid="B94">Krotkov et al., 2017</xref>) consist of (disconnected) segments, e.g., a robot needs to turn the knob before it can open a door. One possible way to address these more complicated tasks is by splitting the control into high and low level policies, similar to the options framework (<xref ref-type="bibr" rid="B162">Sutton et al., 1999</xref>). The higher level policy is trained to orchestrate the low-level policies which could be learned or fixed. Existing approaches typically realize this with discrete switches between the low-level policies, leading to undesirable abrupt changes in the behavior. An alternative would be a continuous blending of policies, controlled by a special kind of recurrent NN which has originally been proposed by <xref ref-type="bibr" rid="B4">Amari (1977)</xref> to model activities in the human brain. Used as policy architectures they can be constructed to exhibit asymptotically stable nonlinear dynamics (<xref ref-type="bibr" rid="B86">Kishimoto and Amari, 1979</xref>). The main benefits of this structure are its easy interpretability via excitation and inhibition of neural potentials, as well as the relatively low number of parameters necessary to create complex and adaptive behavior. A variation of this idea with hand-tuned parameters, i.e., without machine learning, has been applied by <xref ref-type="bibr" rid="B105">Luksch et al. (2012)</xref> to coordinate the activation of pre-defined movement primitives.</p>
</sec>
</sec>
</sec>
<sec id="s7">
<title>Selection of References</title>
<p>We chose the references based on multiple criteria: 1) Our primary goal was to cover all milestones of the sim-to-real research for robotics. 2) In the process, we aimed at diversifying over subfields and research groups. 3) A large proportion of papers came to our attention by running Google Scholar alerts on &#x201C;sim-to-real&#x201D; and &#x201C;reality gap&#x201D; since 2017. 4) Another source were reverse searches starting from highly influential publications. 5) Some papers came to our attention because of citation notifications we received on our work. 6) Finally, a few of the selected publications are recommendations from reviewers, colleagues, or researchers met at conferences. 7) Peer-reviewed papers were strongly preferred over pre-prints.</p>
</sec>
</body>
<back>
<sec id="s8">
<title>Author Contributions</title>
<p>FM: main author; FR: added and edited text, suggested publications, proofread; GT: added and edited text, suggested publications, proofread; WY: added and edited text, suggested publications, proofread; MG: edited text, proofread, (Ph.D. supervisor of FM); JP: added and edited text, suggested publications, proofread, (Ph.D. supervisor of FM).</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>FM gratefully acknowledges the financial support from Honda Research Institute Europe. JP received funding from the European Union&#x2019;s Horizon 2020 research and innovation programme under grant agreement No 640554. WY and GT have been supported by NSF award IIS-1514258.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of Interest</title>
<p>Author FM was employed by the Technical University of Darmstadt in collaboration with the Honda Research Institute Europe. Author FR was employed by NVIDIA. Author WY was employed by Google. Author MG was employed by the Honda Research Institute Europe.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The authors declare that this study received funding from the Honda Research Institute Europe. The funder had the following involvement in the study: the structuring and improvement of this article jointly with the authors, and the decision to submit it for publication.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abdulsamad</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dorau</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Belousov</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Distributionally Robust Trajectory Optimization under Uncertain Dynamics via Relative-Entropy Trust Regions</article-title>. <comment>arXiv 2103</comment>.<fpage>15388</fpage> </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alghonaim</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Johns</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Benchmarking Domain Randomisation for Visual Sim-To-Real Transfer</article-title>. <comment>arXiv 2011.07112</comment> </citation>
</ref>
<ref id="B3">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Allevato</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Short</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Pryor</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Thomaz</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Tunenet: One-Shot Residual Tuning for System Identification and Sim-To-Real Robot Task Transfer</article-title>. In <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <publisher-loc>Osaka, Japan</publisher-loc>, <conf-date>October 30 - November 1</conf-date> (<publisher-name>PMLR</publisher-name>), vol. <volume>100</volume> of <source>Proc. Machine Learn. Res.</source>, <fpage>445</fpage>&#x2013;<lpage>455</lpage>. </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Amari</surname>
<given-names>S.-i.</given-names>
</name>
</person-group> (<year>1977</year>). <article-title>Dynamics of Pattern Formation in Lateral-Inhibition Type Neural fields</article-title>. <source>Biol. Cybern.</source> <volume>27</volume>, <fpage>77</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1007/bf00337259</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Andrychowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Crow</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ray</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Welinder</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Hindsight Experience Replay</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 4-9</conf-date> (<publisher-loc>Long Beach, CA, USA</publisher-loc>, <fpage>5048</fpage>&#x2013;<lpage>5058</lpage>. </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Andrychowicz</surname>
<given-names>O. M.</given-names>
</name>
<name>
<surname>Baker</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Chociej</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>J&#xf3;zefowicz</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>McGrew</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Pachocki</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Learning Dexterous In-Hand Manipulation</article-title>. <source>Int. J. Robotics Res.</source> <volume>39</volume>, <fpage>3</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1177/0278364919887447</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Antonova</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rai</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kragic</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Bayesian Optimization in Variational Latent Spaces with Dynamic Compression</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 30 - November 1</conf-date> (<publisher-loc>Osaka, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>456</fpage>&#x2013;<lpage>465</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>100</volume> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Asada</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Noda</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tawaratsumida</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hosoda</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>1996</year>). <article-title>Purposive Behavior Acquisition for a Real Robot by Vision-Based Reinforcement Learning</article-title>. <source>Mach. Learn.</source> <volume>23</volume>, <fpage>279</fpage>&#x2013;<lpage>303</lpage>. <pub-id pub-id-type="doi">10.1023/A:101823700882310.1007/bf00117447</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>&#xc5;str&#xf6;m</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Wittenmark</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2008</year>). <source>Adaptive Control</source>. <edition>2 edn</edition>. <publisher-name>Dover Publications</publisher-name>. </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Atkeson</surname>
<given-names>C. G.</given-names>
</name>
<name>
<surname>H</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>An</surname>
<given-names>C. H.</given-names>
</name>
</person-group> (<year>1986</year>). <article-title>Estimation of Inertial Parameters of Manipulator Loads and Links</article-title>. <source>Int. J. Robotics Res.</source> <volume>5</volume>, <fpage>101</fpage>&#x2013;<lpage>119</lpage>. <pub-id pub-id-type="doi">10.1177/027836498600500306</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baker</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kanitscheider</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Markov</surname>
<given-names>T. M.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Powell</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>McGrew</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Emergent Tool Use from Multi-Agent Autocurricula</article-title>,&#x201d; in (<publisher-loc>Addis Ababa, Ethiopia</publisher-loc>. <comment>OpenReview.net</comment>.<conf-name>International Conference on Learning Representations (ICLR)</conf-name>
<conf-date>April 26-30</conf-date> </citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Barcelos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lambert</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Borges</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Boots</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Dual Online Stein Variational Inference for Control and Dynamics</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>July 12-16</conf-date>. <comment>Virtual Event</comment>. <pub-id pub-id-type="doi">10.15607/RSS.2021.XVII.068</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Barcelos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Possas</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ott</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>DISCO: Double Likelihood-free Inference Stochastic Control</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 31 - August 31</conf-date> (<publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>10969</fpage>&#x2013;<lpage>10975</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA40945.2020.9196931</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Battaglia</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Hamrick</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Simulation as an Engine of Physical Scene Understanding</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>110</volume>, <fpage>18327</fpage>&#x2013;<lpage>18332</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1306572110</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Battaglia</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Lai</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rezende</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Interaction Networks for Learning about Objects, Relations and Physics</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 5-10</conf-date> (<publisher-loc>Barcelona, Spain</publisher-loc>), <fpage>4502</fpage>&#x2013;<lpage>4510</lpage>. </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bayraksan</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Morton</surname>
<given-names>D. P.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Assessing Solution Quality in Stochastic Programs</article-title>. <source>Math. Program</source> <volume>108</volume>, <fpage>495</fpage>&#x2013;<lpage>514</lpage>. <pub-id pub-id-type="doi">10.1007/s10107-006-0720-x</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beaumont</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Cornuet</surname>
<given-names>J.-M.</given-names>
</name>
<name>
<surname>Marin</surname>
<given-names>J.-M.</given-names>
</name>
<name>
<surname>Robert</surname>
<given-names>C. P.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Adaptive Approximate Bayesian Computation</article-title>. <source>Biometrika</source> <volume>96</volume>, <fpage>983</fpage>&#x2013;<lpage>990</lpage>. <pub-id pub-id-type="doi">10.1093/biomet/asp052</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Louradour</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Collobert</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Weston</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Curriculum Learning</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>June 14-18</conf-date> (<publisher-loc>Montreal, Quebec, Canada</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>41</fpage>&#x2013;<lpage>48</lpage>. <comment>of ACM International Conference Proceeding Series</comment>. <pub-id pub-id-type="doi">10.1145/1553374.1553380</pub-id>
<volume>382</volume> </citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bin Peng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Coumans</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>T.-W.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning Agile Robotic Locomotion Skills by Imitating Animals</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS), Virtual Event/Corvalis</conf-name>, <conf-date>July 12-16</conf-date> (<publisher-loc>Oregon, USA</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2020.XVI.064</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bongard</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zykov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Lipson</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2006</year>). <article-title>Resilient Machines through Continuous Self-Modeling</article-title>. <source>Science</source> <volume>314</volume>, <fpage>1118</fpage>&#x2013;<lpage>1121</lpage>. <pub-id pub-id-type="doi">10.1126/science.1133687</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bousmalis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Irpan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wohlhart</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kelcey</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kalakrishnan</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Using Simulation and Domain Adaptation to Improve Efficiency of Deep Robotic Grasping</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation</conf-name>, <conf-date>May 21-25</conf-date> (<publisher-loc>Brisbane, Australia</publisher-loc>: <publisher-name>ICRA</publisher-name>), <fpage>4243</fpage>&#x2013;<lpage>4250</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8460875</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brockman</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Pettersson</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Openai Gym</article-title>. <comment>
<italic>arXiv</italic> 1606.01540</comment> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brogliato</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Maschke</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lozano</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Egeland</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Dissipative Systems Analysis and Control</article-title>. <source>Theor. Appl.</source> <volume>2</volume>. <pub-id pub-id-type="doi">10.1007/978-1-84628-517-2</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Brooks</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>1992</year>). &#x201c;<article-title>Artificial Life and Real Robots</article-title>,&#x201d; in <conf-name>European Conference on Artificial Life (ECAL)</conf-name>, <conf-date>December 11-13</conf-date> (<publisher-loc>Paris, France</publisher-loc>), <fpage>3</fpage>&#x2013;<lpage>10</lpage>. </citation>
</ref>
<ref id="B25">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Brosseit</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hahner</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Muratore</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <source>Distilled Domain Randomization</source>. <comment><italic>arXiv</italic> 2112.03149</comment>. </citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Calandra</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ivaldi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Deisenroth</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Rueckert</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Learning Inverse Dynamics Models with Contacts</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>26-30 May</conf-date> (<publisher-loc>Seattle, WA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3186</fpage>&#x2013;<lpage>3191</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2015.7139638</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Carlini</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wagner</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Adversarial Examples Are Not Easily Detected</article-title>,&#x201d; in <conf-name>Workshop on Artificial Intelligence and Security (AISec)</conf-name>, <conf-date>November 3</conf-date> (<publisher-loc>Dallas, TX, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>3</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1145/3128572.3140444</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chatzilygeroudis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Using Parameterized Black-Box Priors to Scale up Model-Based Policy Search for Robotics</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 21-25</conf-date> (<publisher-loc>Brisbane, Australia</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8461083</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chatzilygeroudis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Rama</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kaushik</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Goepp</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Vassiliades</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Black-box Data-Efficient Policy Search for Robotics</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>September 24-28</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>51</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2017.8202137</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chatzilygeroudis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Vassiliades</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Stulp</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Calinon</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A Survey on Policy Search Algorithms for Learning Robot Controllers in a Handful of Trials</article-title>. <source>IEEE Trans. Robot.</source> <volume>36</volume>, <fpage>328</fpage>&#x2013;<lpage>347</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2019.2958211</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chebotar</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Handa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Makoviychuk</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Macklin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Issac</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ratliff</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Closing the Sim-To-Real Loop: Adapting Simulation Randomization with Real World Experience</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 20-24</conf-date> (<publisher-loc>Montreal, QC, Canada</publisher-loc>), <fpage>8973</fpage>&#x2013;<lpage>8979</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2019.8793789</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>T. X.</given-names>
</name>
<name>
<surname>Chandraker</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Learning Efficient Object Detection Models with Knowledge Distillation</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 4-9</conf-date> (<publisher-loc>Long Beach, CA, USA</publisher-loc>), <fpage>742</fpage>&#x2013;<lpage>751</lpage>. </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chow</surname>
<given-names>Y. S.</given-names>
</name>
<name>
<surname>Robbins</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>1963</year>). <article-title>On Optimal Stopping Rules</article-title>. <source>Z. Wahrscheinlichkeitstheorie Verw Gebiete</source> <volume>2</volume>, <fpage>33</fpage>&#x2013;<lpage>49</lpage>. <pub-id pub-id-type="doi">10.1007/bf00535296</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Christiano</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Shah</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Mordatch</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Blackwell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tobin</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Transfer from Simulation to Real World through Learning Deep Inverse Dynamics Model</article-title>. <comment>
<italic>arXiv</italic> 1610.03518</comment> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chung</surname>
<given-names>S.-J.</given-names>
</name>
<name>
<surname>Pollard</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Predictable Behavior during Contact Simulation: a Comparison of Selected Physics Engines</article-title>. <source>Comp. Anim. Virtual Worlds</source> <volume>27</volume>, <fpage>262</fpage>&#x2013;<lpage>270</lpage>. <pub-id pub-id-type="doi">10.1002/cav.1712</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ciresan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Meier</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Schmidhuber</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Multi-column Deep Neural Networks for Image Classification</article-title>,&#x201d; in <conf-name>Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-date>June 16-21</conf-date> (<publisher-loc>Providence, RI, USA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>), <fpage>3642</fpage>&#x2013;<lpage>3649</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2012.6248110</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Collins</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Leitner</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Quantifying the Reality gap in Robotic Manipulation Tasks</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 20-24</conf-date> (<publisher-loc>Montreal, QC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6706</fpage>&#x2013;<lpage>6712</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2019.8793591</pub-id> </citation>
</ref>
<ref id="B38">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Coumans</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Tiny Differentiable Simulator</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/google-research/tiny-differentiable-simulator">https://github.com/google-research/tiny-differentiable-simulator</ext-link>
</comment>. </citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Craik</surname>
<given-names>K. J. W.</given-names>
</name>
</person-group> (<year>1943</year>). <source>The Nature of Explanation</source>. </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cranmer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Brehmer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Louppe</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The Frontier of Simulation-Based Inference</article-title>. <source>Proc. Natl. Acad. Sci. USA</source> <volume>117</volume>, <fpage>30055</fpage>&#x2013;<lpage>30062</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.1912789117</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kingsbury</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ramabhadran</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Saon</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sercu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Audhkhasi</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Knowledge Distillation across Ensembles of Multilingual Models for Low-Resource Languages</article-title>,&#x201d; in <conf-name>International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name>, <conf-date>March 5-9</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4825</fpage>&#x2013;<lpage>4829</lpage>. <pub-id pub-id-type="doi">10.1109/ICASSP.2017.7953073</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cully</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Clune</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tarapore</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Robots that Can Adapt like Animals</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>503</fpage>&#x2013;<lpage>507</lpage>. <pub-id pub-id-type="doi">10.1038/nature14422</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cutler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>How</surname>
<given-names>J. P.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Autonomous Drifting Using Simulation-Aided Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 16-21</conf-date> (<publisher-loc>Stockholm, Sweden</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5442</fpage>&#x2013;<lpage>5448</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2016.7487756</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cutler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>How</surname>
<given-names>J. P.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Efficient Reinforcement Learning for Robots Using Informative Simulated Priors</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>26-30 May</conf-date> (<publisher-loc>Seattle, WA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2605</fpage>&#x2013;<lpage>2612</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2015.7139550</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Czarnecki</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Osindero</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jayakumar</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Swirszcz</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Jaderberg</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Distilling Policy Distillation</article-title>,&#x201d; in <conf-name>International Conference on Artificial Intelligence and Statistics (AISTATS)</conf-name>, <conf-date>April 16-18</conf-date> (<publisher-loc>Naha, Okinawa, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1331</fpage>&#x2013;<lpage>1340</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>89</volume> </citation>
</ref>
<ref id="B46">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Landry</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Pavone</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tedrake</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Lyapunov-stable Neural-Network Control</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>July 12-16</conf-date>. <comment>Virtual Event</comment>. <pub-id pub-id-type="doi">10.15607/RSS.2021.XVII.063</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Arulkumaran</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Tukra</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Behbahani</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Bharath</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Analysing Deep Reinforcement Learning Agents Trained with Domain Randomisation</article-title>. <comment>
<italic>arXiv</italic> 1912.08324</comment> </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Degrave</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hermans</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dambre</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wyffels</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A Differentiable Physics Engine for Deep Learning in Robotics</article-title>. <source>Front. Neurorobot.</source> <volume>13</volume>, <fpage>6</fpage>. <pub-id pub-id-type="doi">10.3389/fnbot.2019.00006</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deisenroth</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>A Survey on Policy Search for Robotics</article-title>. <source>FNT in Robotics</source> <volume>2</volume>, <fpage>1</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1561/2300000021</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Deisenroth</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Rasmussen</surname>
<given-names>C. E.</given-names>
</name>
</person-group> (<year>2011</year>). &#x201c;<article-title>PILCO: a Model-Based and Data-Efficient Approach to Policy Search</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>June 28 - July 2</conf-date> (<publisher-loc>Bellevue, Washington, USA</publisher-loc>), <fpage>465</fpage>&#x2013;<lpage>472</lpage>. </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Delage</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Distributionally Robust Optimization under Moment Uncertainty with Application to Data-Driven Problems</article-title>. <source>Operations Res.</source> <volume>58</volume>, <fpage>595</fpage>&#x2013;<lpage>612</lpage>. <pub-id pub-id-type="doi">10.1287/opre.1090.0741</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dennett</surname>
<given-names>D. C.</given-names>
</name>
</person-group> (<year>1975</year>). <article-title>Why the Law of Effect Will Not Go Away</article-title>. <source>J. Theor. Soc. Behav.</source> <pub-id pub-id-type="doi">10.1111/j.1468-5914.1975.tb00350.x</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Watkins</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pathak</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <source>Auto-tuned Sim-To-Real Transfer</source>. <comment>
<italic>arXiv</italic> 2104.07662</comment>. </citation>
</ref>
<ref id="B54">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Durkan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Murray</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Papamakarios</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>On Contrastive Learning for Likelihood-free Inference</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>July 13-18</conf-date> (<publisher-name>PMLR</publisher-name>), <fpage>2771</fpage>&#x2013;<lpage>2781</lpage>. <comment>Virtual Event</comment>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>119</volume> </citation>
</ref>
<ref id="B55">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Erez</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Smart</surname>
<given-names>W. D.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>What Does Shaping Mean for Computational Reinforcement Learning?</article-title>,&#x201d; in <conf-name>International Conference on Development and Learning (ICDL)</conf-name> (<publisher-loc>Monterey, CA, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>215</fpage>&#x2013;<lpage>219</lpage>. </citation>
</ref>
<ref id="B56">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Erez</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tassa</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Todorov</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Simulation Tools for Model-Based Robotics: Comparison of Bullet, Havok, Mujoco, ODE and Physx</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 26-30</conf-date> (<publisher-loc>Seattle, WA, USA</publisher-loc>), <fpage>4397</fpage>&#x2013;<lpage>4404</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2015.7139807</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Fawzi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fawzi</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Frossard</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Fundamental Limits on Adversarial Robustness</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML), Workshop on Deep Learning</conf-name>. </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fearnhead</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Prangle</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Constructing Summary Statistics for Approximate Bayesian Computation: Semi-automatic Approximate Bayesian Computation</article-title>. <source>J. R. Stat. Soc.</source> <volume>74</volume>, <fpage>419</fpage>&#x2013;<lpage>474</lpage>. <pub-id pub-id-type="doi">10.1111/j.1467-9868.2011.01010.x</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feldbaum</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>1960</year>). <article-title>Dual Control Theory. I</article-title>. <source>Avtomatika i Telemekhanika</source> <volume>21</volume>, <fpage>1240</fpage>&#x2013;<lpage>1249</lpage>. </citation>
</ref>
<ref id="B60">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Model-agnostic Meta-Learning for Fast Adaptation of Deep Networks</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning</conf-name>, <conf-date>6-11 August</conf-date> (<publisher-loc>Sydney, NSW, Australia</publisher-loc>: <publisher-name>ICML</publisher-name>), <fpage>1126</fpage>&#x2013;<lpage>1135</lpage>. </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Forrester</surname>
<given-names>A. I. J.</given-names>
</name>
<name>
<surname>S&#xf3;bester</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Keane</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Multi-fidelity Optimization via Surrogate Modelling</article-title>. <source>Proc. R. Soc. A.</source> <volume>463</volume>, <fpage>3251</fpage>&#x2013;<lpage>3269</lpage>. <pub-id pub-id-type="doi">10.1098/rspa.2007.1900</pub-id> </citation>
</ref>
<ref id="B62">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Shlens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Explaining and Harnessing Adversarial Examples</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 7-9</conf-date> (<publisher-loc>San Diego, CA, USA</publisher-loc>). <comment>Conference Track</comment>. </citation>
</ref>
<ref id="B63">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Grant</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Finn</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Griffiths</surname>
<given-names>T. L.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Recasting Gradient-Based Meta-Learning as Hierarchical Bayes</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>April 30 - May 3, 2018</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>). <comment>Conference Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B64">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Greenberg</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Nonnenmacher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Macke</surname>
<given-names>J. H.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Automatic Posterior Transformation for Likelihood-free Inference</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>9-15 June</conf-date> (<publisher-loc>Long Beach, California, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>2404</fpage>&#x2013;<lpage>2414</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>97</volume> </citation>
</ref>
<ref id="B65">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Greydanus</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dzamba</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yosinski</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Hamiltonian Neural Networks</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NeurIPS)</conf-name>, <conf-date>December 8-14</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>), <fpage>15353</fpage>&#x2013;<lpage>15363</lpage>. </citation>
</ref>
<ref id="B66">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>H&#xf6;fer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Bekris</surname>
<given-names>K. E.</given-names>
</name>
<name>
<surname>Handa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Higuera</surname>
<given-names>J. C. G.</given-names>
</name>
<name>
<surname>Golemo</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Mozifian</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Perspectives on Sim2real Transfer for Robotics: A Summary of the R:SS 2020 Workshop</article-title>. <comment>
<italic>arXiv</italic> 2012.03806.</comment> </citation>
</ref>
<ref id="B67">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Haefner</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Queau</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mollenhoff</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Cremers</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Fight Ill-Posedness with Ill-Posedness: Single-Shot Variational Depth Super-resolution from Shading</article-title>,&#x201d; in <conf-name>Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-date>June 18-22</conf-date> (<publisher-loc>Salt Lake City, UT, USA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>), <fpage>164</fpage>&#x2013;<lpage>174</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00025</pub-id> </citation>
</ref>
<ref id="B68">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hanna</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Grounded Action Transformation for Robot Learning in Simulation</article-title>,&#x201d; in <conf-name>AAAI Conference on Artificial Intelligence</conf-name>, <conf-date>February 4-9</conf-date> (<publisher-loc>San Francisco, California, USA</publisher-loc>), <fpage>3834</fpage>&#x2013;<lpage>3840</lpage>. </citation>
</ref>
<ref id="B69">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Heiden</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Millard</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Coumans</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Sheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sukhatme</surname>
<given-names>G. S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>NeuralSim: Augmenting Differentiable Simulators with Neural Networks</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 30 - June 5</conf-date> (<publisher-loc>Xi&#x2019;an, China</publisher-loc>). <pub-id pub-id-type="doi">10.1109/icra48506.2021.9560935</pub-id> </citation>
</ref>
<ref id="B70">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hermans</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Begy</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Louppe</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Likelihood-free MCMC with Amortized Approximate Ratio Estimators</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML), Virtual Event</conf-name>, <conf-date>13-18 July</conf-date> (<publisher-name>PMLR</publisher-name>), <fpage>4239</fpage>&#x2013;<lpage>4248</lpage>. <volume>119</volume> <comment>of Proceedings of Machine Learning Research</comment>. </citation>
</ref>
<ref id="B71">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hinton</surname>
<given-names>G. E.</given-names>
</name>
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Distilling the Knowledge in a Neural Network</article-title>. <comment>
<italic>arXiv</italic> 1503.02531.</comment> </citation>
</ref>
<ref id="B72">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Spielberg</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>W. T.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Chainqueen: A Real-Time Differentiable Physical Simulator for Soft Robotics</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 20-24</conf-date> (<publisher-loc>Montreal, QC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6265</fpage>&#x2013;<lpage>6271</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2019.8794333</pub-id> </citation>
</ref>
<ref id="B73">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>S. H.</given-names>
</name>
<name>
<surname>Papernot</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Adversarial Attacks on Neural Network Policies</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>April 24-26</conf-date> (<publisher-loc>Toulon, France</publisher-loc>). <comment>Workshop Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B74">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ilyas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Santurkar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tsipras</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Engstrom</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Madry</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Adversarial Examples Are Not Bugs, They Are Features</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NeurIPS)</conf-name>, <conf-date>December 8-14</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>), <fpage>125</fpage>&#x2013;<lpage>136</lpage>. </citation>
</ref>
<ref id="B75">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ivaldi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Padois</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Nori</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Tools for Simulating Humanoid Robot Dynamics: A Survey Based on User Feedback</article-title>,&#x201d; in <conf-name>IEEE-RAS International Conference on Humanoid Robots (Humanoids)</conf-name>, <conf-date>November 18-20</conf-date> (<publisher-loc>Madrid, Spain</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>842</fpage>&#x2013;<lpage>849</lpage>. <pub-id pub-id-type="doi">10.1109/HUMANOIDS.2014.7041462</pub-id> </citation>
</ref>
<ref id="B76">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jakobi</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Evolutionary Robotics and the Radical Envelope-Of-Noise Hypothesis</article-title>. <source>Adaptive Behav.</source> <volume>6</volume>, <fpage>325</fpage>&#x2013;<lpage>368</lpage>. <pub-id pub-id-type="doi">10.1177/105971239700600205</pub-id> </citation>
</ref>
<ref id="B77">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jakobi</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Husbands</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Harvey</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>1995</year>). &#x201c;<article-title>Noise and the Reality gap: The Use of Simulation in Evolutionary Robotics</article-title>,&#x201d; in <conf-name>Advances in Artificial Life</conf-name>, <conf-date>June 4-6</conf-date> (<publisher-loc>Granada, Spain</publisher-loc>), <fpage>704</fpage>&#x2013;<lpage>720</lpage>. <pub-id pub-id-type="doi">10.1007/3-540-59496-5_337</pub-id> </citation>
</ref>
<ref id="B78">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>James</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Davison</surname>
<given-names>A. J.</given-names>
</name>
<name>
<surname>Johns</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Transferring End-To-End Visuomotor Control from Simulation to Real World for a Multi-Stage Task</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>November 13-15</conf-date> (<publisher-loc>Mountain View, California, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>334</fpage>&#x2013;<lpage>343</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>78</volume> </citation>
</ref>
<ref id="B79">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>James</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wohlhart</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Kalakrishnan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kalashnikov</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Irpan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ibarz</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Sim-to-real via Sim-To-Sim: Data-Efficient Robotic Grasping via Randomized-To-Canonical Adaptation Networks</article-title>,&#x201d; in <conf-name>Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <conf-date>June 16-20</conf-date> (<publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>Computer Vision Foundation/IEEE</publisher-name>), <fpage>12627</fpage>&#x2013;<lpage>12637</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2019.01291</pub-id> </citation>
</ref>
<ref id="B80">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Simgan: Hybrid Simulator Identification for Domain Adaptation via Adversarial Reinforcement Learning</article-title>. <comment>
<italic>arXiv</italic> 2101.06005</comment> </citation>
</ref>
<ref id="B81">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>K&#xf6;rber</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lange</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rediske</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Steinmann</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gl&#xfc;ck</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Comparing Popular Simulation Environments in the Scope of Robotics and Reinforcement Learning</article-title>. <comment>
<italic>arXiv</italic> 2103.04616.</comment> </citation>
</ref>
<ref id="B82">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kadian</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Truong</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gokaslan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Clegg</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wijmans</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Sim2real Predictivity: Does Evaluation in Simulation Predict Real-World Performance?</article-title> <source>IEEE Robot. Autom. Lett.</source> <volume>5</volume>, <fpage>6670</fpage>&#x2013;<lpage>6677</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2020.3013848</pub-id> </citation>
</ref>
<ref id="B83">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kahn</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Marshall</surname>
<given-names>A. W.</given-names>
</name>
</person-group> (<year>1953</year>). <article-title>Methods of Reducing Sample Size in Monte Carlo Computations</article-title>. <source>Or</source> <volume>1</volume>, <fpage>263</fpage>&#x2013;<lpage>278</lpage>. <pub-id pub-id-type="doi">10.1287/opre.1.5.263</pub-id> </citation>
</ref>
<ref id="B84">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kakade</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2001</year>). &#x201c;<article-title>A Natural Policy Gradient</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 3-8</conf-date> (<publisher-loc>Vancouver, British Columbia, Canada</publisher-loc>), <fpage>1531</fpage>&#x2013;<lpage>1538</lpage>. </citation>
</ref>
<ref id="B85">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kaspar</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Munoz Osorio</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Bock</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Sim2real Transfer for Reinforcement Learning without Dynamics Randomization</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>October 24 - January 24</conf-date> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4383</fpage>&#x2013;<lpage>4388</lpage>. <pub-id pub-id-type="doi">10.1109/IROS45743.2020.9341260</pub-id> </citation>
</ref>
<ref id="B86">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kishimoto</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Amari</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1979</year>). <article-title>Existence and Stability of Local Excitations in Homogeneous Neural fields</article-title>. <source>J. Math. Biol.</source> <volume>7</volume>, <fpage>303</fpage>&#x2013;<lpage>318</lpage>. <pub-id pub-id-type="doi">10.1007/bf00275151</pub-id> </citation>
</ref>
<ref id="B87">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Klink</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Abdulsamad</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Belousov</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>D&#x2019;Eramo</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pajarinen</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <source>A Probabilistic Interpretation of Self-Paced Learning with Applications to Reinforcement Learning</source>. <comment>
<italic>arXiv</italic> 2102.13176.</comment> </citation>
</ref>
<ref id="B88">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Klink</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Abdulsamad</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Belousov</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Self-paced Contextual Reinforcement Learning</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 30 - November 1</conf-date> (<publisher-loc>Osaka, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>513</fpage>&#x2013;<lpage>529</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>100</volume>. </citation>
</ref>
<ref id="B89">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kober</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bagnell</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Reinforcement Learning in Robotics: A Survey</article-title>. <source>Int. J. Robotics Res.</source> <volume>32</volume>, <fpage>1238</fpage>&#x2013;<lpage>1274</lpage>. <pub-id pub-id-type="doi">10.1177/0278364913495721</pub-id> </citation>
</ref>
<ref id="B90">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kolev</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Klodt</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Cremers</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Continuous Global Optimization in Multiview 3d Reconstruction</article-title>. <source>Int. J. Comput. Vis.</source> <volume>84</volume>, <fpage>80</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-009-0233-1</pub-id> </citation>
</ref>
<ref id="B91">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Koos</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
<name>
<surname>Doncieux</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Crossing the Reality gap in Evolutionary Robotics by Promoting Transferable Controllers</article-title>,&#x201d; in <conf-name>Genetic and Evolutionary Computation Conference (GECCO)</conf-name>, <conf-date>July 7-11</conf-date> (<publisher-loc>Portland, Oregon, USA</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>119</fpage>&#x2013;<lpage>126</lpage>. <pub-id pub-id-type="doi">10.1145/1830483.1830505</pub-id> </citation>
</ref>
<ref id="B92">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koos</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mouret</surname>
<given-names>J.-B.</given-names>
</name>
<name>
<surname>Doncieux</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>The Transferability Approach: Crossing the Reality gap in Evolutionary Robotics</article-title>. <source>IEEE Trans. Evol. Computat.</source> <volume>17</volume>, <fpage>122</fpage>&#x2013;<lpage>145</lpage>. <pub-id pub-id-type="doi">10.1109/TEVC.2012.2185849</pub-id> </citation>
</ref>
<ref id="B93">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G. E.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Imagenet Classification with Deep Convolutional Neural Networks</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 3-6</conf-date> (<publisher-loc>Lake Tahoe, Nevada, United States</publisher-loc>), <fpage>1106</fpage>&#x2013;<lpage>1114</lpage>. </citation>
</ref>
<ref id="B94">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krotkov</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hackett</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jackel</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Perschbacher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pippine</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Strauss</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>The DARPA Robotics challenge Finals: Results and Perspectives</article-title>. <source>J. Field Robotics</source> <volume>34</volume>, <fpage>229</fpage>&#x2013;<lpage>240</lpage>. <pub-id pub-id-type="doi">10.1002/rob.21683</pub-id> </citation>
</ref>
<ref id="B95">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Pathak</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>RMA: Rapid Motor Adaptation for Legged Robots</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS), Virtual Event</conf-name>, <conf-date>July 12-16</conf-date>. <pub-id pub-id-type="doi">10.15607/RSS.2021.XVII.011</pub-id> </citation>
</ref>
<ref id="B96">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Koller</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Self-paced Learning for Latent Variable Models</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>6-9 December</conf-date> (<publisher-loc>Vancouver, British Columbia, Canada</publisher-loc>), <fpage>1189</fpage>&#x2013;<lpage>1197</lpage>. </citation>
</ref>
<ref id="B97">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kurakin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Adversarial Examples in the Physical World</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>April 24-26</conf-date> (<publisher-loc>Toulon, France</publisher-loc>). <comment>Workshop Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B98">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Landau</surname>
<given-names>I. D.</given-names>
</name>
<name>
<surname>Lozano</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>M&#x2019;Saad</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Karimi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2011</year>). <source>Adaptive Control: Algorithms, Analysis and Applications</source>. <edition>2 edn</edition>. <publisher-name>Springer Science &#x26; Business Media</publisher-name>. </citation>
</ref>
<ref id="B99">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Koltun</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Variational Policy Search via Trajectory Optimization</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 5-8</conf-date> (<publisher-loc>Lake Tahoe, Nevada, USA</publisher-loc>), <fpage>207</fpage>&#x2013;<lpage>215</lpage>. </citation>
</ref>
<ref id="B100">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Pastor</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ibarz</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Quillen</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Learning Hand-Eye Coordination for Robotic Grasping with Deep Learning and Large-Scale Data Collection</article-title>. <source>Int. J. Robotics Res.</source> <volume>37</volume>, <fpage>421</fpage>&#x2013;<lpage>436</lpage>. <pub-id pub-id-type="doi">10.1177/0278364917710318</pub-id> </citation>
</ref>
<ref id="B101">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lillicrap</surname>
<given-names>T. P.</given-names>
</name>
<name>
<surname>Hunt</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Pritzel</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Heess</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Erez</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tassa</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>Continuous Control with Deep Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 2-4</conf-date> (<publisher-loc>San Juan, Puerto Rico</publisher-loc>). <comment>Conference Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B102">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ramachandran</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Stein Variational Policy Gradient</article-title>,&#x201d; in <conf-name>Association for Uncertainty in Artificial Intelligence (UAI)</conf-name>, <conf-date>August 11-15</conf-date> (<publisher-loc>Sydney, Australia</publisher-loc>). </citation>
</ref>
<ref id="B103">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lowrey</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kolev</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rajeswaran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Todorov</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Reinforcement Learning for Non-prehensile Manipulation: Transfer from Simulation to Physical System</article-title>,&#x201d; in <conf-name>Simulation, Modeling, and Programming for Autonomous Robots (SIMPAR)</conf-name>, <conf-date>May 16-19</conf-date> (<publisher-loc>Brisbane, Australia</publisher-loc>), <fpage>35</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1109/SIMPAR.2018.8376268</pub-id> </citation>
</ref>
<ref id="B104">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lueckmann</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gon&#xe7;alves</surname>
<given-names>P. J.</given-names>
</name>
<name>
<surname>Bassetto</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>&#xd6;cal</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Nonnenmacher</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Macke</surname>
<given-names>J. H.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Flexible Statistical Inference for Mechanistic Models of Neural Dynamics</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems</conf-name>, <conf-date>December 4-9</conf-date> (<publisher-loc>Long Beach, CA, USA</publisher-loc>: <publisher-name>NIPS</publisher-name>), <fpage>1289</fpage>&#x2013;<lpage>1299</lpage>. </citation>
</ref>
<ref id="B105">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Luksch</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>M&#xfc;hlig</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yoshiike</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>Adaptive Movement Sequences and Predictive Decisions Based on Hierarchical Dynamical Systems</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>October 7-12</conf-date> (<publisher-loc>Vilamoura, Algarve, Portugal</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2082</fpage>&#x2013;<lpage>2088</lpage>. <pub-id pub-id-type="doi">10.1109/iros.2012.6385651</pub-id> </citation>
</ref>
<ref id="B106">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lutter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mannor</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Garg</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021a</year>). <source>Robust Value Iteration for Continuous Control Tasks</source>. <comment>
<italic>arXiv</italic> 2105.12189</comment>. </citation>
</ref>
<ref id="B107">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lutter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ritter</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Deep Lagrangian Networks: Using Physics as Model Prior for Deep Learning</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 6-9</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>). <comment>Conference Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B108">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lutter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Silberbauer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Watson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Differentiable Physics Models for Real-World Offline Model-Based Reinforcement Learning</article-title>. <comment>
<italic>arXiv</italic> 2011.01734</comment> </citation>
</ref>
<ref id="B109">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mak</surname>
<given-names>W.-K.</given-names>
</name>
<name>
<surname>Morton</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Wood</surname>
<given-names>R. K.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Monte Carlo Bounding Techniques for Determining Solution Quality in Stochastic Programs</article-title>. <source>Operations Res. Lett.</source> <volume>24</volume>, <fpage>47</fpage>&#x2013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.1016/S0167-6377(98)00054-6</pub-id> </citation>
</ref>
<ref id="B110">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mandlekar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Garg</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fei-Fei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Savarese</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Adversarially Robust Policy Learning: Active Construction of Physically-Plausible Perturbations</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>September 24-28</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3932</fpage>&#x2013;<lpage>3939</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2017.8206245</pub-id> </citation>
</ref>
<ref id="B111">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Marco</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Berkenkamp</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Hennig</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schoellig</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Krause</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Virtual vs. Real: Trading off Simulations and Physical Experiments in Reinforcement Learning with Bayesian Optimization</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 29 - Jun 3</conf-date> (<publisher-loc>Marina Bay Sands, Singapore</publisher-loc>). <pub-id pub-id-type="doi">10.1109/icra.2017.7989186</pub-id> </citation>
</ref>
<ref id="B112">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Marjoram</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Molitor</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Plagnol</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tavare</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Markov Chain Monte Carlo without Likelihoods</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>100</volume>, <fpage>15324</fpage>&#x2013;<lpage>15328</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0306899100</pub-id> </citation>
</ref>
<ref id="B113">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Martin Martin</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Brock</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Online Interactive Perception of Articulated Objects with Multi-Level Recursive Estimation Based on Task-specific Priors</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>September 14-18</conf-date> (<publisher-loc>Chicago, IL, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2494</fpage>&#x2013;<lpage>2501</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2014.6942902</pub-id> </citation>
</ref>
<ref id="B114">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Matas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>James</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Davison</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sim-to-real Reinforcement Learning for Deformable Object Manipulation</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 29-31</conf-date> (<publisher-loc>Z&#xfc;rich, Switzerland</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>734</fpage>&#x2013;<lpage>743</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>87</volume> </citation>
</ref>
<ref id="B115">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Matl</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Narang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bajcsy</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Inferring the Material Properties of Granular media for Robotic Tasks</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 31 - August 31</conf-date> (<publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2770</fpage>&#x2013;<lpage>2777</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA40945.2020.9197063</pub-id> </citation>
</ref>
<ref id="B116">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mehta</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Diaz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Golemo</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Pal</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Paull</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Active Domain Randomization</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 30 - November 1</conf-date> (<publisher-loc>Osaka, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>1162</fpage>&#x2013;<lpage>1176</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>100</volume> </citation>
</ref>
<ref id="B117">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mehta</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Handa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>A User&#x2019;s Guide to Calibrating Robotics Simulators</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL), Virtual Event</conf-name>, <conf-date>November 16 - 18</conf-date> (<publisher-name>PMLR</publisher-name>). <comment>Proceedings of Machine Learning Research</comment>. </citation>
</ref>
<ref id="B118">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Metropolis</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ulam</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1949</year>). <article-title>The Monte Carlo Method</article-title>. <source>J. Am. Stat. Assoc.</source> <volume>44</volume>, <fpage>335</fpage>&#x2013;<lpage>341</lpage>. <pub-id pub-id-type="doi">10.1080/01621459.1949.10483310</pub-id> </citation>
</ref>
<ref id="B119">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mnih</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Veness</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bellemare</surname>
<given-names>M. G.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Human-level Control through Deep Reinforcement Learning</article-title>. <source>Nature</source> <volume>518</volume>, <fpage>529</fpage>&#x2013;<lpage>533</lpage>. <pub-id pub-id-type="doi">10.1038/nature14236</pub-id> </citation>
</ref>
<ref id="B120">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Molchanov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Honig</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Preiss</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Ayanian</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Sukhatme</surname>
<given-names>G. S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Sim-to-(multi)-real: Transfer of Low-Level Robust Control Policies to Multiple Quadrotors</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>November 3-8</conf-date> (<publisher-loc>Macau, SAR, China</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>59</fpage>&#x2013;<lpage>66</lpage>. <pub-id pub-id-type="doi">10.1109/IROS40897.2019.8967695</pub-id> </citation>
</ref>
<ref id="B121">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mordatch</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Lowrey</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Todorov</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Ensemble-cio: Full-Body Dynamic Motion Planning that Transfers to Physical Humanoids</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>September 28 - October 2</conf-date> (<publisher-loc>Hamburg, Germany</publisher-loc>), <fpage>5307</fpage>&#x2013;<lpage>5314</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2015.7354126</pub-id> </citation>
</ref>
<ref id="B122">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morere</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Ott</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Learning to Plan Hierarchically from Curriculum</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>4</volume>, <fpage>2815</fpage>&#x2013;<lpage>2822</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2019.2920285</pub-id> </citation>
</ref>
<ref id="B123">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mozian</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Camilo Gamboa Higuera</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Meger</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Dudek</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning Domain Randomization Distributions for Training Robust Locomotion Policies</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>October 24 - January 24</conf-date> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6112</fpage>&#x2013;<lpage>6117</lpage>. <pub-id pub-id-type="doi">10.1109/IROS45743.2020.9341019</pub-id> </citation>
</ref>
<ref id="B124">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muratore</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Eilers</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Data-efficient Domain Randomization with Bayesian Optimization</article-title>. <source>IEEE Robot. Autom. Lett.</source> <volume>6</volume>, <fpage>911</fpage>&#x2013;<lpage>918</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2021.3052391</pub-id> </citation>
</ref>
<ref id="B125">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Muratore</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Assessing Transferability from Simulation to Reality for Reinforcement Learning</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>43</volume>, <fpage>1172</fpage>&#x2013;<lpage>1183</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2019.2952353</pub-id> </citation>
</ref>
<ref id="B126">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Muratore</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gruner</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wiese</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>B. B. M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021c</year>). &#x201c;<article-title>Neural Posterior Domain Randomization</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL), Virtual Event</conf-name>, <conf-date>November 8-11</conf-date> (<publisher-loc>London, England</publisher-loc>). </citation>
</ref>
<ref id="B127">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Muratore</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Treede</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gienger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Domain Randomization for Simulation-Based Policy Optimization with Transferability Assessment</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 29-31</conf-date> (<publisher-loc>Z&#xfc;rich, Switzerland</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>700</fpage>&#x2013;<lpage>713</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>87</volume> </citation>
</ref>
<ref id="B128">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nagabandi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Clavera</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Fearing</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Learning to Adapt in Dynamic, Real-World Environments through Meta-Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 6-9</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>). <comment>(OpenReview.net)</comment>. </citation>
</ref>
<ref id="B129">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ng</surname>
<given-names>A. Y.</given-names>
</name>
<name>
<surname>Jordan</surname>
<given-names>M. I.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>PEGASUS: a Policy Search Method for Large Mdps and Pomdps</article-title>,&#x201d; in <conf-name>UAI</conf-name>, <conf-date>June 30 - July 3</conf-date> (<publisher-loc>Stanford, California, USA</publisher-loc>: <publisher-name>Morgan Kaufmann</publisher-name>), <fpage>406</fpage>&#x2013;<lpage>415</lpage>. </citation>
</ref>
<ref id="B130">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>OpenAI</surname>
</name>
<name>
<surname>Akkaya</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Andrychowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chociej</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Litwin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>McGrew</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Solving Rubik&#x2019;s Cube with a Robot Hand</article-title>. <comment>
<italic>arXiv</italic> 1910.07113</comment> </citation>
</ref>
<ref id="B131">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pan</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>A Survey on Transfer Learning</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source> <volume>22</volume>, <fpage>1345</fpage>&#x2013;<lpage>1359</lpage>. <pub-id pub-id-type="doi">10.1109/TKDE.2009.191</pub-id> </citation>
</ref>
<ref id="B132">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Papamakarios</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Murray</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Fast <italic>&#x3f5;</italic>-free Inference of Simulation Models with Bayesian Conditional Density Estimation</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 5-10</conf-date> (<publisher-loc>Barcelona, Spain</publisher-loc>), <fpage>1028</fpage>&#x2013;<lpage>1036</lpage>. </citation>
</ref>
<ref id="B133">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Papamakarios</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sterratt</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Murray</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Sequential Neural Likelihood: Fast Likelihood-free Inference with Autoregressive Flows</article-title>,&#x201d; in <conf-name>International Conference on Artificial Intelligence and Statistics (AISTATS)</conf-name>, <conf-date>April 16-18</conf-date> (<publisher-loc>Naha, Okinawa, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>837</fpage>&#x2013;<lpage>848</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>89</volume> </citation>
</ref>
<ref id="B134">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Parisotto</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Salakhutdinov</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Actor-mimic: Deep Multitask and Transfer Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 2-4</conf-date> (<publisher-loc>San Juan, Puerto Rico</publisher-loc>). <comment>Conference Track</comment>. </citation>
</ref>
<ref id="B135">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Paul</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Osborne</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Whiteson</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Fingerprint Policy Optimisation for Robust Reinforcement Learning</article-title>. In <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <publisher-loc>Long Beach California, USA</publisher-loc>, <conf-date>9-15 June</conf-date> (<publisher-name>PMLR</publisher-name>), vol. <volume>97</volume> </citation>
</ref>
<ref id="B136">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>X. B.</given-names>
</name>
<name>
<surname>Andrychowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sim-to-real Transfer of Robotic Control with Dynamics Randomization</article-title>,&#x201d; in <conf-name>International Conference on Robotics and Automation (ICRA)</conf-name>, <conf-date>May 21-25</conf-date> (<publisher-loc>Brisbane, Australia</publisher-loc>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8460528</pub-id> </citation>
</ref>
<ref id="B137">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Perlin</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>Improving Noise</article-title>. <source>ACM Trans. Graph.</source> <volume>21</volume>, <fpage>681</fpage>&#x2013;<lpage>682</lpage>. <pub-id pub-id-type="doi">10.1145/566654.566636</pub-id> </citation>
</ref>
<ref id="B138">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>M&#xfc;lling</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Altun</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Relative Entropy Policy Search</article-title>,&#x201d; in <conf-name>AAAI Conference on Artificial Intelligence</conf-name>, <conf-date>July 11-15</conf-date> (<publisher-loc>Atlanta, Georgia, USA</publisher-loc>). </citation>
</ref>
<ref id="B139">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pinto</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Andrychowicz</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Welinder</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Asymmetric Actor Critic for Image-Based Robot Learning</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>June 26-30</conf-date> (<publisher-loc>Pittsburgh, Pennsylvania, USA</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2018.XIV.008</pub-id> </citation>
</ref>
<ref id="B140">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Pinto</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Davidson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sukthankar</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Robust Adversarial Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>August 6-11</conf-date> (<publisher-loc>Sydney, NSW, Australia</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>2817</fpage>&#x2013;<lpage>2826</lpage>. </citation>
</ref>
<ref id="B141">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Polvara</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Patacchiola</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hanheide</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Neumann</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Sim-to-real Quadrotor landing via Sequential Deep Q-Networks and Domain Randomization</article-title>. <source>Robotics</source> <volume>9</volume>, <fpage>8</fpage>. <pub-id pub-id-type="doi">10.3390/robotics9010008</pub-id> </citation>
</ref>
<ref id="B142">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Possas</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Barcelos</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Oliveira</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Online Bayessim for Combined Simulator Parameter Inference and Policy Improvement</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>October 24 - January 24</conf-date> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5445</fpage>&#x2013;<lpage>5452</lpage>. <pub-id pub-id-type="doi">10.1109/IROS45743.2020.9341401</pub-id> </citation>
</ref>
<ref id="B143">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Radford</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Child</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Luan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Amodei</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Language Models Are Unsupervised Multitask Learners</source>. </citation>
</ref>
<ref id="B144">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rajeswaran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ghotra</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ravindran</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Epopt: Learning Robust Neural Network Policies Using Model Ensembles</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>April 24-26</conf-date> (<publisher-loc>Toulon, France</publisher-loc>). <comment>Conference Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B145">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rajeswaran</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mordatch</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>A Game Theoretic Framework for Model Based Reinforcement Learning</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML), Virtual Event</conf-name>, <conf-date>13-18 July</conf-date> (<publisher-name>PMLR</publisher-name>), <fpage>7953</fpage>&#x2013;<lpage>7963</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>119</volume> </citation>
</ref>
<ref id="B146">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ramos</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Possas</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>BayesSim: Adaptive Domain Randomization via Probabilistic Inference for Robotics Simulators</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>June 22-26</conf-date> (<publisher-loc>Freiburg im Breisgau, Germany</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2019.XV.029</pub-id> </citation>
</ref>
<ref id="B147">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rasmussen</surname>
<given-names>C. E.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>C. K. I.</given-names>
</name>
</person-group> (<year>2006</year>). <source>
<italic>Gaussian Processes for Machine Learning</italic>. Adaptive Computation and Machine Learning</source>. <publisher-name>MIT Press</publisher-name>. </citation>
</ref>
<ref id="B148">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rawlik</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Toussaint</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vijayakumar</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2012</year>). <publisher-loc>Sydney, NSW, Australia</publisher-loc>: <publisher-name>RSS</publisher-name>. <pub-id pub-id-type="doi">10.15607/RSS.2012.VIII.045</pub-id>
<article-title>On Stochastic Optimal Control and Reinforcement Learning by Approximate Inference</article-title>
<conf-name>Robotics: Science and Systems</conf-name>
<conf-date>July 9-13</conf-date> </citation>
</ref>
<ref id="B149">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ruiz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Schulter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chandraker</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Learning to Simulate</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 6-9</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>). <comment>(OpenReview.net)</comment>. </citation>
</ref>
<ref id="B150">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Russakovsky</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Krause</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Satheesh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>ImageNet Large Scale Visual Recognition Challenge</article-title>. <source>Int. J. Comput. Vis.</source> <volume>115</volume>, <fpage>211</fpage>&#x2013;<lpage>252</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-015-0816-y</pub-id> </citation>
</ref>
<ref id="B151">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Colmenarejo</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>G&#xfc;l&#xe7;ehre</surname>
<given-names>&#xc7;.</given-names>
</name>
<name>
<surname>Desjardins</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kirkpatrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2016a</year>). &#x201c;<article-title>Policy Distillation</article-title>,&#x201d; in (<publisher-loc>San Juan, Puerto Rico</publisher-loc>). <comment>Conference Track</comment>.<conf-name>International Conference on Learning Representations (ICLR)</conf-name>
<conf-date>May 2-4</conf-date> </citation>
</ref>
<ref id="B152">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Rabinowitz</surname>
<given-names>N. C.</given-names>
</name>
<name>
<surname>Desjardins</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Soyer</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Kirkpatrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kavukcuoglu</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2016b</year>). <article-title>Progressive Neural Networks</article-title>. <comment>
<italic>arXiv</italic> 1606.04671</comment> </citation>
</ref>
<ref id="B153">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Rusu</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Vecerik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Roth&#xf6;rl</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Heess</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Pascanu</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hadsell</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Sim-to-real Robot Learning from Pixels with Progressive Nets</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL), Mountain View</conf-name>, <conf-date>November 13-15</conf-date> (<publisher-loc>California, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>262</fpage>&#x2013;<lpage>270</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>78</volume> </citation>
</ref>
<ref id="B154">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sadeghi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Levine</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>CAD2RL: Real Single-Image Flight without a Single Real Image</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>July 12-16</conf-date> (<publisher-loc>Cambridge, Massachusetts, USA</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2017.XIII.034</pub-id> </citation>
</ref>
<ref id="B155">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Santoro</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bartunov</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Botvinick</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wierstra</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Lillicrap</surname>
<given-names>T. P.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Meta-learning with Memory-Augmented Neural Networks</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning (ICML)</conf-name>, <conf-date>June 19-24</conf-date> (<publisher-loc>New York City, NY, USA</publisher-loc>: <publisher-name>JMLR.org</publisher-name>), <fpage>1842</fpage>&#x2013;<lpage>1850</lpage>.<volume>48</volume> </citation>
</ref>
<ref id="B156">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Schulman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wolski</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Dhariwal</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Radford</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Klimov</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Proximal Policy Optimization Algorithms</source>. <comment>
<italic>arXiv</italic> 1707.06347</comment>. </citation>
</ref>
<ref id="B157">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Siekmann</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Green</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Warila</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fern</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hurst</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Blind Bipedal Stair Traversal via Sim-To-Real Reinforcement Learning</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS), Virtual Event</conf-name>, <conf-date>July 12-16</conf-date>. <pub-id pub-id-type="doi">10.15607/RSS.2021.XVII.061</pub-id> </citation>
</ref>
<ref id="B158">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Silver</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Maddison</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Guez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sifre</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>van den Driessche</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Mastering the Game of Go with Deep Neural Networks and Tree Search</article-title>. <source>Nature</source> <volume>529</volume>, <fpage>484</fpage>&#x2013;<lpage>489</lpage>. <pub-id pub-id-type="doi">10.1038/nature16961</pub-id> </citation>
</ref>
<ref id="B159">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sunn&#xe5;ker</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Busetto</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Numminen</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Corander</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Foll</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dessimoz</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Approximate Bayesian Computation</article-title>. <source>Plos Comput. Biol.</source> <volume>9</volume>, <fpage>e1002803</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1002803</pub-id> </citation>
</ref>
<ref id="B160">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sutanto</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>A. S.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mukadam</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sukhatme</surname>
<given-names>G. S.</given-names>
</name>
<name>
<surname>Rai</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>Encoding Physical Constraints in Differentiable Newton-Euler Algorithm</article-title>,&#x201d; in <conf-name>L4DC, Virtual Event</conf-name>, <conf-date>11-12 June</conf-date> (<publisher-loc>Berkeley, CA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>804</fpage>&#x2013;<lpage>813</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>120</volume> </citation>
</ref>
<ref id="B161">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>R. S.</given-names>
</name>
</person-group> (<year>1991</year>). <article-title>Dyna, an Integrated Architecture for Learning, Planning, and Reacting</article-title>. <source>SIGART Bull.</source> <volume>2</volume>, <fpage>160</fpage>&#x2013;<lpage>163</lpage>. <pub-id pub-id-type="doi">10.1145/122344.122377</pub-id> </citation>
</ref>
<ref id="B162">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Precup</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Between MDPs and Semi-MDPs: A Framework for Temporal Abstraction in Reinforcement Learning</article-title>. <source>Artif. Intelligence</source> <volume>112</volume>, <fpage>181</fpage>&#x2013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1016/S0004-3702(99)00052-1</pub-id> </citation>
</ref>
<ref id="B163">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bruna</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goodfellow</surname>
<given-names>I. J.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). &#x201c;<article-title>Intriguing Properties of Neural Networks</article-title>,&#x201d; in (<publisher-loc>Banff, Canada</publisher-loc>). <comment>Conference Track</comment>.<conf-name>International Conference on Learning Representations (ICLR)</conf-name>
<conf-date>April 14-16</conf-date> </citation>
</ref>
<ref id="B164">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Coumans</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Iscen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hafner</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Sim-to-real: Learning Agile Locomotion for Quadruped Robots</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>June 26-30</conf-date> (<publisher-loc>Pittsburgh, Pennsylvania, USA</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2018.XIV.010</pub-id> </citation>
</ref>
<ref id="B165">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Teh</surname>
<given-names>Y. W.</given-names>
</name>
<name>
<surname>Bapst</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Czarnecki</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kirkpatrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hadsell</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Distral: Robust Multitask Reinforcement Learning</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name> (<publisher-loc>Long Beach, CA, USA</publisher-loc>), <fpage>4496</fpage>&#x2013;<lpage>4506</lpage>. </citation>
</ref>
<ref id="B166">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ting</surname>
<given-names>J.-A.</given-names>
</name>
<name>
<surname>D&#x2019;Souza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Bayesian Robot System Identification with Input and Output Noise</article-title>. <source>Neural Networks</source> <volume>24</volume>, <fpage>99</fpage>&#x2013;<lpage>108</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2010.08.011</pub-id> </citation>
</ref>
<ref id="B167">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ting</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Mistry</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Schaal</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nakanishi</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2006</year>). &#x201c;<article-title>A Bayesian Approach to Nonlinear Parameter Identification for Rigid Body Dynamics</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>August 16-19</conf-date> (<publisher-loc>Philadelphia, Pennsylvania, USA</publisher-loc>: <publisher-name>The MIT Press</publisher-name>). <pub-id pub-id-type="doi">10.15607/RSS.2006.II.032</pub-id> </citation>
</ref>
<ref id="B168">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tobin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fong</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ray</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>September 24-28</conf-date> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>), <fpage>23</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.1109/IROS.2017.8202133</pub-id> </citation>
</ref>
<ref id="B169">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>van den Oord</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Representation Learning with Contrastive Predictive Coding</article-title>. <comment>
<italic>arXiv</italic> 1807.03748</comment> </citation>
</ref>
<ref id="B170">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van Parys</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kuhn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Goulart</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Morari</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Distributionally Robust Control of Constrained Stochastic Systems</article-title>. <source>IEEE Trans. Automat. Contr.</source> <volume>61</volume>, <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1109/TAC.2015.2444134</pub-id> </citation>
</ref>
<ref id="B171">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kurth-Nelson</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Soyer</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Leibo</surname>
<given-names>J. Z.</given-names>
</name>
<name>
<surname>Tirumala</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Munos</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Learning to Reinforcement Learn</article-title>,&#x201d; in <conf-name>Cognitive Science</conf-name>, <conf-date>16-29 July</conf-date> (<publisher-loc>London, UK</publisher-loc>. <comment>cognitivesciencesociety.org)</comment>. </citation>
</ref>
<ref id="B172">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Fleet</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Hertzmann</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>Optimizing Walking Controllers for Uncertain Inputs and Environments</article-title>. <source>ACM Trans. Graphics</source> <volume>29</volume>, <fpage>73</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1145/1778765.1778810</pub-id> </citation>
</ref>
<ref id="B173">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Watson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Abdulsamad</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Findeisen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Peters</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Stochastic Control through Approximate Bayesian Input Inference</article-title>. <comment>
<italic>arXiv</italic> 2105</comment>.<fpage>07693</fpage> </citation>
</ref>
<ref id="B174">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Williams</surname>
<given-names>R. J.</given-names>
</name>
</person-group> (<year>1992</year>). <article-title>Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning</article-title>. <source>Mach Learn.</source> <volume>8</volume>, <fpage>229</fpage>&#x2013;<lpage>256</lpage>. <pub-id pub-id-type="doi">10.1007/BF00992696</pub-id> </citation>
</ref>
<ref id="B175">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wittenmark</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>1995</year>). &#x201c;<article-title>Adaptive Dual Control Methods: An Overview</article-title>,&#x201d; in <conf-name>Adaptive Systems in Control and Signal Processing 1995</conf-name> (<publisher-name>Elsevier</publisher-name>), <fpage>67</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1016/b978-0-08-042375-3.50010-x</pub-id> </citation>
</ref>
<ref id="B176">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wright</surname>
<given-names>R. D.</given-names>
</name>
<name>
<surname>Ramsay</surname>
<given-names>T. E.</given-names>
</name>
</person-group> (<year>1979</year>). <article-title>On the Effectiveness of Common Random Numbers</article-title>. <source>Manag. Sci.</source> <volume>25</volume>, <fpage>649</fpage>&#x2013;<lpage>656</lpage>. <pub-id pub-id-type="doi">10.1287/mnsc.25.7.649</pub-id> </citation>
</ref>
<ref id="B177">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yildirim</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Lim</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Freeman</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Galileo: Perceiving Physical Object Properties by Integrating a Physics Engine with Deep Learning</article-title>,&#x201d; in <conf-name>Conference on Neural Information Processing Systems (NIPS)</conf-name>, <conf-date>December 7-12</conf-date> (<publisher-loc>Montreal, Quebec, Canada</publisher-loc>), <fpage>127</fpage>&#x2013;<lpage>135</lpage>. </citation>
</ref>
<ref id="B178">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Clary</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Morais</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hurst</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>van de Panne</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Learning Locomotion Skills for Cassie: Iterative Design and Sim-To-Real</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL)</conf-name>, <conf-date>October 30 - November 1</conf-date> (<publisher-loc>Osaka, Japan</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>317</fpage>&#x2013;<lpage>329</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>100</volume> </citation>
</ref>
<ref id="B179">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Da</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>van de Panne</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Babich</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Garg</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Dynamics Randomization Revisited: A Case Study for Quadrupedal Locomotion</article-title>. <comment>
<italic>arXiv</italic> 2011.02404</comment> </citation>
</ref>
<ref id="B180">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Vangipuram</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Abbeel</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Pinto</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Learning Predictive Representations for Deformable Objects Using Contrastive Estimation</article-title>,&#x201d; in <conf-name>Conference on Robot Learning (CoRL), Virtual Event</conf-name>, <conf-date>November 16 - 18</conf-date> (<publisher-loc>Virtual Event/Cambridge, MA, USA</publisher-loc>: <publisher-name>PMLR</publisher-name>), <fpage>564</fpage>&#x2013;<lpage>574</lpage>. <comment>of Proceedings of Machine Learning Research</comment>.<volume>155</volume>. </citation>
</ref>
<ref id="B181">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Lyapunov Stability and Strong Passivity Analysis for Nonlinear Descriptor Systems</article-title>. <source>IEEE Trans. Circuits Syst.</source> <volume>60</volume>, <fpage>1003</fpage>&#x2013;<lpage>1012</lpage>. <pub-id pub-id-type="doi">10.1109/TCSI.2012.2215396</pub-id> </citation>
</ref>
<ref id="B182">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V. C.</given-names>
</name>
<name>
<surname>Turk</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C. K.</given-names>
</name>
</person-group> (<year>2019a</year>). &#x201c;<article-title>Sim-to-real Transfer for Biped Locomotion</article-title>,&#x201d; in <conf-name>International Conference on Intelligent Robots and Systems (IROS)</conf-name>, <conf-date>November 3-8</conf-date> (<publisher-loc>Macau, SAR, China</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3503</fpage>&#x2013;<lpage>3510</lpage>. <pub-id pub-id-type="doi">10.1109/IROS40897.2019.8968053</pub-id> </citation>
</ref>
<ref id="B183">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C. K.</given-names>
</name>
<name>
<surname>Turk</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2019b</year>). &#x201c;<article-title>Policy Transfer with Strategy Optimization</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR)</conf-name>, <conf-date>May 6-9</conf-date> (<publisher-loc>New Orleans, LA, USA</publisher-loc>). <comment>Conference Track (OpenReview.net)</comment>. </citation>
</ref>
<ref id="B184">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Karen Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Turk</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Preparing for the Unknown: Learning a Universal Policy with Online System Identification</article-title>,&#x201d; in <conf-name>Robotics: Science and Systems (RSS)</conf-name>, <conf-date>July 12-16</conf-date> (<publisher-loc>Cambridge, Massachusetts, USA</publisher-loc>). <pub-id pub-id-type="doi">10.15607/RSS.2017.XIII.048</pub-id> </citation>
</ref>
<ref id="B185">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Boning</surname>
<given-names>D. S.</given-names>
</name>
<name>
<surname>Hsieh</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Robust Reinforcement Learning on State Observations with Learned Optimal Adversary</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations (ICLR), Virtual Event</conf-name>, <conf-date>May 3-7</conf-date> (<publisher-loc>Austria</publisher-loc>. <comment>OpenReview.net)</comment>. </citation>
</ref>
<ref id="B186">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Plappert</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zaremba</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). <source>Predicting Sim-To-Real Transfer with Probabilistic Dynamics Models</source>, <fpage>12864</fpage>. <comment>
<italic>arXiv</italic> 2009</comment>. </citation>
</ref>
<ref id="B187">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kuhn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wiesemann</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mathematical Foundations of Robust and Distributionally Robust Optimization</article-title>. <comment>
<italic>arXiv</italic> 2105.00760</comment> </citation>
</ref>
<ref id="B188">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Doyle</surname>
<given-names>J. C.</given-names>
</name>
</person-group> (<year>1998</year>). <source>Essentials of Robust Control</source>, <volume>104</volume>. <publisher-name>Prentice-Hall</publisher-name>. </citation>
</ref>
<ref id="B189">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhuang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>A Comprehensive Survey on Transfer Learning</article-title>. <source>Proc. IEEE</source> <volume>109</volume>, <fpage>43</fpage>&#x2013;<lpage>76</lpage>. <pub-id pub-id-type="doi">10.1109/JPROC.2020.3004555</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>