<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1331849</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2024.1331849</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>A reinforcement learning agent for head and neck intensity-modulated radiation therapy</article-title>
<alt-title alt-title-type="left-running-head">Stephens et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2024.1331849">10.3389/fphy.2024.1331849</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Stephens</surname>
<given-names>Hunter</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2564661/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Xinyi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1068838/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sheng</surname>
<given-names>Yang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/730672/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Qiuwen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1031953/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ge</surname>
<given-names>Yaorong</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/515546/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Wu</surname>
<given-names>Q. Jackie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1580277/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Radiation Oncology</institution>, <institution>Duke University</institution>, <addr-line>Durham</addr-line>, <addr-line>NC</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of Software and Information Systems</institution>, <institution>University of North Carolina at Charlotte</institution>, <addr-line>Charlotte</addr-line>, <addr-line>NC</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/995904/overview">Xun Jia</ext-link>, Johns Hopkins Medicine, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1949317/overview">Yin Gao</ext-link>, University of Texas Southwestern Medical Center, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1633979/overview">William Hrinivich</ext-link>, Johns Hopkins University, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Q. Jackie Wu, <email>jackie.wu@duke.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>01</day>
<month>02</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1331849</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>11</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>01</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Stephens, Li, Sheng, Wu, Ge and Wu.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Stephens, Li, Sheng, Wu, Ge and Wu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Head and neck (HN) cancers pose a difficult problem in the planning of intensity-modulated radiation therapy (IMRT) treatment. The primary tumor can be large and asymmetrical, and multiple organs at risk (OARs) with varying dose-sparing goals lie close to the target volume. Currently, there is no systematic way of automating the generation of IMRT plans, and the manual options face planning quality and long planning time challenges. In this article, we present a reinforcement learning (RL) model for the purposes of providing automated treatment planning to reduce clinical workflow time as well as providing a better starting point for human planners to modify and build upon. Several models with progressing complexity are presented, including the relevant plan dosimetry analysis and model interpretations of the resulting strategies learned by the auto-planning agent. Models were trained on a set of 40 patients and validated on a set of 20 patients. The presented models are shown to be consistent with the requirements of an RL model to be underpinned by a Markov decision process (MDP). In-depth interpretability of the models is presented by examination of the decision space using action hyperplanes. The auto-planning agent was able to generate plans with superior reduction in the mean dose of the left and right parotid glands by approximately 7&#xa0;Gy <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.5&#xa0;Gy (<italic>p</italic> &#x3c; 0.01) over a starting, static template plan with only pre-defined general prescription information. RL plans were comparable to a human expert&#x2019;s clinical plans for the primary (44&#xa0;Gy), boost (26&#xa0;Gy), and the summed plans (70&#xa0;Gy) with <italic>p</italic>-values of 0.43, 0.72, and 0.67, respectively, for the dosimetric endpoints and uniform target coverage normalization. The RL planning agent was able to produce the plans used in validation in an average of 13.58&#xa0;min, with a minimum and a maximum planning time of 2.27 and 44.82&#xa0;min, respectively.</p>
</abstract>
<kwd-group>
<kwd>reinforcement learning</kwd>
<kwd>radiation therapy</kwd>
<kwd>automated treatment planning</kwd>
<kwd>head and neck cancer</kwd>
<kwd>interpretable machine learning</kwd>
</kwd-group>
<contract-sponsor id="cn001">National Institutes of Health<named-content content-type="fundref-id">10.13039/100000002</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medical Physics and Imaging</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The designing process of a radiation therapy treatment plan for head and neck (HN) cancers can be time-consuming. The proximity of critical organs to a usually large and asymmetric primary target volume (PTV) leads to numerous trade-offs between sparing adjacent organs at risk (OARs) and healthy tissue and delivering the prescribed radiation dosage to the tumor. These trade-offs are usually based on more complex dosimetric endpoints other than simply the minimum or maximum dose limits, in particular the mean or median dose for the parotid glands and oral cavity. The parotid glands are important to spare at the risk of severe xerostomia or inadequate salivary function. Complications from xerostomia include poor dental hygiene, oral infections, sleep disturbances, pain, and difficulty chewing and swallowing [<xref ref-type="bibr" rid="B1">1</xref>]. This must be considered as the target must be treated, but complications should be avoided for the long-term health of the patient. Radiobiological and post-treatment studies have shown that severe xerostomia can be avoided by limiting at least one of the glands&#x2019; mean dose to less than 20&#xa0;Gy or the dose of both glands to less than 25&#xa0;Gy [<xref ref-type="bibr" rid="B2">2</xref>]. Overdosing of the oral cavity can lead to severe complications or oral mucositis that can have a very strong, negative impact on the patient&#x2019;s quality of life [<xref ref-type="bibr" rid="B3">3</xref>]. To avoid these side effects, Wang et al. [<xref ref-type="bibr" rid="B4">4</xref>] recommends that the oral cavity outside of the PTV should have a mean dose of less than 41.8&#xa0;Gy, which is associated with a significant reduction in oral mucositis as compared to 58.8&#xa0;Gy.</p>
<p>The highly conformal and sharp gradient distributions from intensity-modulated radiation therapy (IMRT) have been shown to have a significant improvement in parotid gland sparing over 3D conformal therapy [<xref ref-type="bibr" rid="B5">5</xref>&#x2013;<xref ref-type="bibr" rid="B7">7</xref>]. This is due to the fact that modulation of the radiation fields can be optimized given a specific objective set by solving an inverse optimization problem based on the dose deposition matrices of the treatment field set. However, although parotid glands are anatomically symmetric, relative to each other, it is common for the parotid glands to not be symmetric about the PTV due to the irregularity of the defined target. It is possible for one parotid gland to be more proximal to the target and/or have a larger overlap volume. This leads to the difficulty in sparing the two parotids evenly. A more proximal gland may not be able to meet the dose objectives, while the other could have a more optimal dose distribution than prescribed. In this case, the dose objectives can be removed or relaxed from one gland to enhance the sparing of the other [<xref ref-type="bibr" rid="B8">8</xref>]. This is commonly referred to as single-sided versus bi-lateral sparing. It is usually determined by the physician by examining the spatial features of the parotid glands in relation to the PTV. While no particular protocol is used to determine single-side or bi-lateral sparing, there have been many methods developed to predict the possible sparing of OARs determined by anatomical features and past plans [<xref ref-type="bibr" rid="B7">7</xref>, <xref ref-type="bibr" rid="B9">9</xref>&#x2013;<xref ref-type="bibr" rid="B13">13</xref>]. Furthermore, it has been shown that the predicted median dose is suitable as a criterion for choosing single-sided or bi-lateral sparing [<xref ref-type="bibr" rid="B14">14</xref>].</p>
<p>Different PTVs in HN cancer often have several dose levels. The first is to a larger target volume that includes the entire region to be treated and is usually prescribed a dose of approximately 44&#xa0;Gy. The second is to a smaller region within the large target region that will receive another boosted dose usually of approximately 26&#xa0;Gy. Together, these two planning schemes create a volume with a prescription of 44&#xa0;Gy and a prescription of 70&#xa0;Gy to the smaller contained boosted region and are noted as the primary, boost, and plan sum, respectively. There are two primary strategies for achieving this: simultaneous integrated boost (IMRT-SIB) and sequential (IMRT-SEQ). IMRT-SIB achieves the treatment plan by treating the primary and boosted target simultaneously, while IMRT-SEQ treats them as two separate plans. Both methods have been shown to have similar survival rates [<xref ref-type="bibr" rid="B15">15</xref>], and thus for this study, IMRT-SEQ will be used as is consistent with our institutions&#x2019; current practice.</p>
<p>While this potential exists, the complex nature of the planning process coupled with the trial-and-error tuning of planning objectives results in a plan quality that is highly correlated to the planner&#x2019;s experience [<xref ref-type="bibr" rid="B16">16</xref>]. This has led to a large influx of research into aiding this planning process using machine learning techniques [<xref ref-type="bibr" rid="B17">17</xref>]. One of the more seminal and important tools is knowledge-based planning (KBP), which aims to estimate certain aspects of a plan such as the dose distribution and dose&#x2013;volume histograms (DVHs). This method has been widely used and studied [<xref ref-type="bibr" rid="B18">18</xref>&#x2013;<xref ref-type="bibr" rid="B23">23</xref>]. Perhaps, the most optimistic application of machine learning to automatic planning can be found using reinforcement learning (RL), in which a machine seeks to mimic the decision processes of an expert planner. While in the nascent stages of development, RL has shown some promising results in modifying prostate plans where an intermediate plan was given as the input and an optimal strategy predicted [<xref ref-type="bibr" rid="B24">24</xref>, <xref ref-type="bibr" rid="B25">25</xref>]. Again with prostate plans, automatic planning was shown to have success using deep reinforcement learning to modify plan parameters [<xref ref-type="bibr" rid="B26">26</xref>]. RL was also used for non-small-cell lung cancer; however, this application relied on a 3D dose prediction engine [<xref ref-type="bibr" rid="B27">27</xref>]. Neither of these applications seemed to demonstrate <italic>de novo</italic> plan creation and/or relied on methods with little to no interpretability. While deep learning methods have shown very positive and encouraging results, there is a lack of interpretability and sometimes a requirement of a large amount of data for the models to train properly. 
Another good and more relevant example of interpretable RL is found in the work done by Zhang et al. [<xref ref-type="bibr" rid="B28">28</xref>] in the development of an auto-planning agent for stereotactic body radiation therapy (SBRT) for pancreatic cancer. The action space used in this model consisted of increasing and decreasing the maximum or minimum dose values so that the state and action space could be easily interpreted. However, more often, objectives other than the minimum and maximum dose are used in planning, and a more robust action space is needed as is the case with HN cancers. At this time, there exists no technology for creating IMRT plans from scratch, which can handle the complexity of HN cancer treatment with multiple goals and provide an insight into the strategy used by the planning agent. Therefore, this work aims to explore the development and investigation of an RL model for the purposes of HN IMRT planning.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methods</title>
<sec id="s2-1">
<title>2.1 Model definitions and transition probabilities</title>
<p>RL can be modeled as a Markov decision process (MDP). An MDP is a 4-tuple, <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula>, where<list list-type="simple">
<list-item>
<p>&#x2022; <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a set of states which defines the environment in which the agent operates.</p>
</list-item>
<list-item>
<p>&#x2022; <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a set of actions in which the agent can operate on the environment inducing some change in state and reward.</p>
</list-item>
<list-item>
<p>&#x2022; <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> defines the state-action probability transitions, that is,</p>
</list-item>
<list-item>
<p>&#x2022; <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>Pr</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2192;</mml:mo>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</p>
</list-item>
<list-item>
<p>&#x2022; <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> defines the reward observed from a change of state induced by an action.</p>
</list-item>
</list>
</p>
<p>Each of these will be defined for this model in the following sections. The following notation will be adopted. The state will be denoted by a vector, <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">R</mml:mi>
<mml:mi>n</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; the action by some integer, <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
<mml:mo>&#x2286;</mml:mo>
<mml:msup>
<mml:mi mathvariant="double-struck">Z</mml:mi>
<mml:mo>&#x2b;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>; and the reward by some real-valued number, <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo mathvariant="double-struck">&#x2208;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>The RL process can be described as the iterative interaction of an agent with an environment. The environment is defined by some current state observed by the agent. Ideally, this should include all the information available to the agent to make informed decisions. In the treatment planning process, a human planner mainly observes three objects of information. The first is the current dose distribution. Irrespective of the current plan configuration (beam settings, IMRT objectives, etc.), there is some resulting dose distribution. A human planner can observe the full distribution or the individual DVHs for OARs of interest. Most of this information is unnecessary or unprocessable for a human. Simply including all the information into the state vector would exponentially increase the model size, leading to an intractable problem. For instance, the entire dose distribution is defined at every point within a 3-D volume and can contain thousands of data points. Thus, in the proposed RL model, a dosimetric summary for each structure of interest will be included in the state definition. For the parotid glands and oral cavity, both the mean and median doses are included in the state, and for the PTV, the doses at 95% and 1% volume are included to summarize the target&#x2019;s coverage and hotspot. These are included to ensure that the target is sufficiently treated (coverage) and that the dose is not too high (hotspot) to cause complications. The second piece of information is the current objective set for the plan. When deciding whether to move a planning objective for a structure, a human planner will take into consideration where the current objective is. If there is not much difference in the objective and the current dosimetry, then that objective could possibly be pushed further. 
The converse is true as well, in that a large difference between the objective and current dosimetric state could indicate that the objective movement would not have a large impact on the change of state. A third piece of information is the spatial features of the patient&#x2019;s anatomy. The size and proximity of critical organs to the PTV play a large role in how well the organ can be spared. This would impact how much a human planner would need to decrease the dose for a particular organ. For example, if one of the parotids had a substantial overlap with the target, then getting the mean dose below 25&#xa0;Gy would be quite difficult without substantially lessening the coverage of the target. For the purposes of the current study, the spatial features are not directly represented in the state definition. It is assumed that this information is inherently encoded into the dose deposition matrix as the deposition matrix is a function of the anatomy distribution. This can be reasoned from the fact that the optimization, which is driven by the dose deposition matrix, will have certain responses based on the spatial characteristics of the medium. The agent will be assumed to gain the spatial information from the response of the optimization. The environment state is then defined in Eq. <xref ref-type="disp-formula" rid="e1">1</xref> as follows:<disp-formula id="e1">
<mml:math id="m11">
<mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>%</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>95</mml:mn>
<mml:mo>%</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>%</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf11">
<mml:math id="m12">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> represents a specific structure of interest, <inline-formula id="inf12">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the current position of the dose objective for structure <inline-formula id="inf13">
<mml:math id="m14">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf14">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the current position of the volume objective for structure <inline-formula id="inf15">
<mml:math id="m16">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. This formulation also holds for multiple objectives for a single structure as these will just be appended beside the first objective in order.</p>
<p>When the agent interacts with the environment, it takes certain actions and then observes the change in state. These actions must be encoded into the RL problem appropriately. During manual planning, the planner is iteratively moving the objectives on structures, with the ultimate goal of reaching an overall optimal dose distribution balancing all objectives, that is, the dose and volume of an objective are being changed whether increased or decreased. With the model developed and investigated by Zhang [<xref ref-type="bibr" rid="B28">28</xref>], the agent was allowed to increase and decrease the maximum dose objective only. For this investigation, an additional action of increasing or decreasing the volume objective will be added to the objectives that are not linked to the maximum dose only. This can be visualized as the moving of an objective in the dose&#x2013;volume space of a structure&#x2019;s DVH which mimics the actions of a human planner. This, in reality, is a continuous action but will be encoded as a discrete action by Eq. <xref ref-type="disp-formula" rid="e2">2</xref>
<disp-formula id="e2">
<mml:math id="m17">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="equ1">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>where <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mtext>&#x2009;and&#x2009;</mml:mtext>
<mml:mo>&#x394;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are discrete values. In the current study, we have experimentally set <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>G</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>5</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
<p>A critical component of the MDP is the governance of the underlying transition probabilities. For the information or strategy to be learned, there must be some order to the dynamics of the states under the influence of actions. This does not mean that a certain outcome is guaranteed given an action while in a certain state, but that the transition of the state is governed by some well-defined probability distribution. In order to define and investigate these dynamics, different portions of the state will be investigated independently. First, the portion of the state that describes the current location of an objective has completely deterministic dynamics, and is described in Eq. <xref ref-type="disp-formula" rid="e3">3</xref>.<disp-formula id="e3">
<mml:math id="m22">
<mml:mrow>
<mml:mi>Pr</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="equ2">
<mml:math id="m23">
<mml:mrow>
<mml:mi>Pr</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2192;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#xb1;</mml:mo>
<mml:mo>&#x394;</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The transition is more complex for the dosimetric portion of the state. First, this portion of the state is continuous. Thus, the transition will be defined in Eq. <xref ref-type="disp-formula" rid="e4">4</xref> as some perturbation of the current portion of the state,<disp-formula id="e4">
<mml:math id="m24">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf19">
<mml:math id="m25">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates a dosimetric element of the state. Then, the entire transition can be characterized by the perturbation which we will consider a continuous random variable defined by the probability distribution, <inline-formula id="inf20">
<mml:math id="m26">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Note that the definition of the probability is dependent on the current state and the action taken.</p>
<p>One of the important properties of an MDP is that the states are Markovian, that is, the state transition probability is only a function of the current state and no other past states. More formally in Eq. <xref ref-type="disp-formula" rid="e5">5</xref>,<disp-formula id="e5">
<mml:math id="m27">
<mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mi>&#x3b4;</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>This intuitively holds as the optimization problem is only a function of the current objectives and state and is agnostic to any past states or decisions. In addition, the spatial features do not vary significantly across patients so that these features do not cause significant changes in the system dynamics.</p>
<p>The reward function will inform the agent of the effect of an action. In some formulations, a reward or penalty is not given for every action and is only given for reaching a determined endpoint like winning or losing a game. Due to the complexity and size of our problem, however, we will formulate the reward function in a way to speed up the convergence. In this formulation, a reward or penalty will be given to the agent based on the effect of the current action. This will be determined based on some plan loss function. This loss function will calculate the cost of the current state of a plan and will consist of penalties for not hitting certain goals. These penalties will include for the PTV not reaching <inline-formula id="inf21">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>95</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>44</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>G</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and for the OARs not meeting the prescription for the mean doses. This model will only take into account a single base plan with no boost and will, therefore, scale the prescriptions for the parotids and oral cavity to 15&#xa0;Gy and 25&#xa0;Gy, respectively. The loss function, <inline-formula id="inf22">
<mml:math id="m29">
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, can then be written as the sum of individual loss functions, where the individual structure loss functions are simply the relative difference between the actual dosimetric quantity and the goal. Finally, the reward at some time <inline-formula id="inf23">
<mml:math id="m30">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is given as follows in Eq. <xref ref-type="disp-formula" rid="e6">6</xref>:<disp-formula id="e6">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi mathvariant="script">L</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-2">
<title>2.2 Q-function and model updating</title>
<p>The Q-function calculates the quality of a state&#x2013;action pair, that is, being in state <inline-formula id="inf24">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, it assesses the quality of taking action <inline-formula id="inf25">
<mml:math id="m33">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>. For discrete state and action spaces, <inline-formula id="inf26">
<mml:math id="m34">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can simply be a matrix. However, since the state space is continuous, <inline-formula id="inf27">
<mml:math id="m35">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> must be approximated by a function. The approximation for <inline-formula id="inf28">
<mml:math id="m36">
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> used in this study is defined as follows in Eq. <xref ref-type="disp-formula" rid="e7">7</xref>:<disp-formula id="e7">
<mml:math id="m37">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>In this definition, <inline-formula id="inf29">
<mml:math id="m38">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a weighting vector and <inline-formula id="inf30">
<mml:math id="m39">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> can be looked at as a selector function defined in Eq. <xref ref-type="disp-formula" rid="e8">8</xref>.<disp-formula id="e8">
<mml:math id="m40">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2297;</mml:mo>
<mml:msub>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>With these definitions, the learning procedure follows the state&#x2013;action&#x2013;reward&#x2013;state&#x2013;action (SARSA) algorithm. With a state and action, a reward as well as the following state&#x2013;action pair is observed. The weighting vector is then updated by Eq. <xref ref-type="disp-formula" rid="e9">9</xref>
<disp-formula id="e9">
<mml:math id="m41">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf31">
<mml:math id="m42">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf32">
<mml:math id="m43">
<mml:mrow>
<mml:mi>&#x3b3;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are hyper-parameters, namely, learning rate and discount factor, respectively.</p>
<p>In this learning scheme, the policy of choosing an action in a given state is bootstrapped. Ultimately, the agent would select the best available action for a given state. However, at the beginning, the agent has very little idea of how to act, and thus its actions are mostly random. As the agent learns more, the rate at which actions are taken randomly should decrease, allowing more informed choices. This continues until the end of learning, where the agent will be taking mostly informed actions with a smaller chance of exploration. The policy of action-taking is then formulated as in Eq. <xref ref-type="disp-formula" rid="e10">10</xref>, with random variables <inline-formula id="inf33">
<mml:math id="m44">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x223c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
<disp-formula id="e10">
<mml:math id="m45">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3c0;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="center">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x223c;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>U</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:munder>
<mml:mi>max</mml:mi>
<mml:mi>a</mml:mi>
</mml:munder>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
<mml:mtd>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>where <inline-formula id="inf34">
<mml:math id="m46">
<mml:mrow>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of taking a random action against an informed one. At the beginning of learning, it should be very high and decrease to some final probability, <inline-formula id="inf35">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>&#x221e;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For this scheme, the following expression in Eq. <xref ref-type="disp-formula" rid="e11">11</xref> for <inline-formula id="inf36">
<mml:math id="m48">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is adopted.<disp-formula id="e11">
<mml:math id="m49">
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>&#x221e;</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msup>
<mml:mi>t</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:msup>
<mml:mi>N</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mfrac>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>&#x221e;</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mi>&#x221e;</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where <inline-formula id="inf37">
<mml:math id="m50">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a set number of iterations. All the iterations involved with one plan are considered an episode, and an epoch is where all the episodes for the plans have been performed. Thus, <inline-formula id="inf38">
<mml:math id="m51">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>e</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The learning scheme involves applying some starting template to a plan and then taking actions based on the current policy.</p>
</sec>
<sec id="s2-3">
<title>2.3 Q-function action hyperplanes</title>
<p>Given the above definition of the Q-function, it can also be appropriate to represent it as a matrix equation for interpretation purposes, that is, <inline-formula id="inf39">
<mml:math id="m52">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf40">
<mml:math id="m53">
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is a weighting matrix instead of a weighting vector. The Q-function can now be written as follows in Eq. <xref ref-type="disp-formula" rid="e12">12</xref>:<disp-formula id="e12">
<mml:math id="m54">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>a</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:mi mathvariant="bold-italic">W</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>where <inline-formula id="inf41">
<mml:math id="m55">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3b4;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. In the situation where the policy is to choose the best possible action at each state, the agent will select <inline-formula id="inf42">
<mml:math id="m56">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> that maximizes <inline-formula id="inf43">
<mml:math id="m57">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, we can look at the boundary between two actions, <inline-formula id="inf44">
<mml:math id="m58">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf45">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The boundary separating the space where each one would be more optimal over the other for a given state is given by <inline-formula id="inf46">
<mml:math id="m60">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mover accent="true">
<mml:mi>Q</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which gives in Eq. <xref ref-type="disp-formula" rid="e13">13</xref>,<disp-formula id="e13">
<mml:math id="m61">
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mi>k</mml:mi>
</mml:munder>
</mml:mstyle>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
<p>The above equation is of a plane in hyperspace. On one side of the plane, one action is preferable, and on the other side, the other action is preferable for the given state.</p>
</sec>
<sec id="s2-4">
<title>2.4 Model training</title>
<p>Two different RL models were trained using an in-house dose and fluence calculation engine [<xref ref-type="bibr" rid="B29">29</xref>]. Each model is at the center of an auto-planning agent that controls the dose and fluence calculation engine by manipulating the dose&#x2013;volume objectives to generate optimal treatment plans. The first (model 1) is a model in which the agent could move the dose value of the objective up or down. The second (model 2) was a model in which the agent could move both the dose and volume values of the objective. This second model gave the agent full control as a human planner would have. Under the approval of the institutional IRB protocol, a training set of 40 patients was used for training and a separate set of 20 patients for validation. A sensitivity test was also performed on model 1 using a dataset of only 15 patients to examine the effect the size of the training set has on model performance. The patient data consisted of the CT images and structure sets and were completely anonymized with no personal identifiers present. The dataset contained an even mixture of plans where one of the parotids was in closer contact or proximity with the target. The distributions of the overlap with the target and the median distance from the target were essentially equal between the left and right parotids, and thus no obvious bias was present in the dataset. The overall goal present in the reward function was to try and meet the dosimetric goals for the left and right parotids (LP/RP) and the oral cavity (OC). These goals were defined as <inline-formula id="inf47">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf48">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> less than 15&#xa0;Gy for the LP and RP and 25&#xa0;Gy for the OC, where <inline-formula id="inf49">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> stands for the dose received by 50% of the volume or the median dose and <inline-formula id="inf50">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> stands for the mean of the entire dose distribution for the OAR.</p>
<p>The training consisted of a series of episodes within multiple epochs. An episode is defined as the agent taking actions on one particular plan. After the agent performs a number of actions on a particular plan, it moves on to the next plan. If all plans have been iterated over, the epoch is over, and the agent may start again. At the beginning of each episode, each plan is set with a set of initial template objectives. This is to ensure that there are distinct starting points for corresponding episodes across epochs. The initial template is static for the PTV objectives, always setting the lower and upper bounds at the same point. The starting template also contained maximum dose objectives on the spinal cord and larynx along with a normal tissue objective (NTO). A maximum dose objective on the larynx is not common and is used here only to keep the agent from sacrificing it for larger gains toward the goals. For the organs investigated, an objective was placed for <inline-formula id="inf51">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>50</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to be no greater than the organ-specific goal (15&#xa0;Gy for the parotids, 25&#xa0;Gy for the oral cavity). The models were then trained using the SARSA algorithm [<xref ref-type="bibr" rid="B30">30</xref>]. Both models were trained using the full training set.</p>
</sec>
<sec id="s2-5">
<title>2.5 Model analysis</title>
<p>To ensure that the model definitions were consistent with those of an MDP, state&#x2013;action transition probability functions were investigated by sampling state transitions under certain actions from the training data. Then, for a given action, the probability density function for the state transitions in question was estimated using kernel density estimation [<xref ref-type="bibr" rid="B31">31</xref>]. Finally, the dependence of the state change on the elements of the current state will be measured using a Spearman&#x2019;s rank correlation coefficient.</p>
<p>To investigate the sensitivity of model training to the size of the training set, the results from model 1 trained on the small dataset (<italic>N</italic> &#x3d; 15) were compared to those on the large dataset (<italic>N</italic> &#x3d; 40). With equal weighting between the left and right parotids, any bias was quantified by comparing the resulting Q-function between the small and large set. Quantification of the bias was performed in two ways. The first was by observing the magnitudes of the state&#x2013;action pairs of the Q-function weighting matrix throughout training. For example, <inline-formula id="inf52">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf53">
<mml:math id="m68">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> corresponds to the action of lowering the left parotid dose and <inline-formula id="inf54">
<mml:math id="m69">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> corresponds to the mean dose of the left parotid (i.e., the <inline-formula id="inf55">
<mml:math id="m70">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> th element of the state vector). The second is by examining the slope of the decision plane between the actions of lowering the left or right parotid dose throughout training. This is found by projecting onto the portion of the state pertaining to only the left and right parotid mean dose. More explicitly in Eq. <xref ref-type="disp-formula" rid="e14">14</xref>,<disp-formula id="e14">
<mml:math id="m71">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:msub>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where <inline-formula id="inf56">
<mml:math id="m72">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> pertains to lowering the left parotid dose, <inline-formula id="inf57">
<mml:math id="m73">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> pertains to lowering the right parotid dose, <inline-formula id="inf58">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> refers to the portion of the state vector containing the current left parotid mean dose, <inline-formula id="inf59">
<mml:math id="m75">
<mml:mrow>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> refers to the portion of the state vector containing the current right parotid mean dose, and <inline-formula id="inf60">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf61">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:msub>
<mml:mi>k</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> refer to the current left and right parotid mean dose, respectively.</p>
<p>For each of the models, the Q-function was investigated using action hyperplanes. This analysis investigated the structure of the weighting matrix of the final Q-function and interpreted how the agent will act given that weighting matrix. The individual model performance and its ability to plan on new cases were investigated by having the agent create plans and then compare model plans with corresponding clinical plans. During planning, the agent continued to take actions until all goals were met or a maximum number of actions were met. The maximum number of actions is set to ensure the agent has ample time to meet all goals and was set to be 35. The agent-created plan was compared to clinical plans in two scenarios. For both scenarios, the agent was given the task to devise two separate plans for each case. The primary plan was a 44&#xa0;Gy prescription to the primary PTV, and the boost plan was a 26&#xa0;Gy prescription to the boost PTV. A comparison was also performed using the plan sums, which was simply the summation of the 44&#xa0;Gy and 26&#xa0;Gy plans. In the first scenario, no plan-specific goals were included, and the agent simply planned using the learned models. In the second scenario, the states were scaled to incorporate plan-specific goals for the parotids and oral cavity that were used in the clinical plans. The scaling was performed by scaling the dosimetric value associated with each goal with the difference between the plan-specific goal and the original goal for which the agent was trained on (i.e., 15&#xa0;Gy for parotids and 25&#xa0;Gy for oral cavity). For instance, consider that the goal for the left parotid is a mean dose of 12&#xa0;Gy. In some state where the actual mean dose of the left parotid is 15&#xa0;Gy, the agent would see this as the goal being met. Thus, the mean dose of 15&#xa0;Gy must be scaled such that the agent knows it is still 3&#xa0;Gy away from obtaining the goal. 
It must be noted that this scaling is only performed during validation when the agent is planning on plans not used in the training set. These plan-specific goals must be determined beforehand, either by physician prescription and preference or by some predetermined base-case scenario for the organ of interest. For the validation plans, the plan-specific goals were taken as the final mean dose for the organs in the corresponding clinical plan.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 State&#x2013;action transition probabilities</title>
<p>The state and action transition probability distributions were found to behave in an intuitive manner. The increasing and decreasing of the dose objective produced translations in the distributions of the change in the dosimetric state roughly around the amount the objective was changed. The distributions were Gaussian with a mean of just under <inline-formula id="inf62">
<mml:math id="m78">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>G</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for the median dose and less than <inline-formula id="inf63">
<mml:math id="m79">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>0.5</mml:mn>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>G</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> for the mean dose. The change in the mean dose was not as strong as that of the median dose when increasing or decreasing the dose objective. This is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. Changing the volume was less predictable, but it still acted as expected in the fact that increasing and decreasing the volume objective had a similar effect on the dosimetric state. The change in the state was shown to not be independent of the current state. Furthermore, partitioning of the data into the first and last 20% of transitions showed that when changing the volume in the first 20% of transitions, the resulting change in the dose was higher than that in the last 20% of transitions. This was due to the fact that the starting position of the volume objective was higher in the first 20% than in the last 20% of transitions. This is shown by analyzing the correlation of both the change in the dose and the position of the volume objective when increasing or decreasing that objective and is demonstrated in <xref ref-type="fig" rid="F2">Figure 2</xref>. The change in the volume had a stronger response when the initial volume objective was higher, and these two variables showed strong correlations with each other with a correlation of 0.68 and 0.72 for increasing and decreasing the volume, respectively. The vast majority of correlations between the change in the state and the current state were almost 0 besides these two instances. The response of the dose to changing the dose objective behaved in the same manner as with changing the volume.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Distributions of the change in <inline-formula id="inf64">
<mml:math id="m80">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mn mathvariant="bold">50</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf65">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mi mathvariant="bold-italic">e</mml:mi>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for the left parotid when <bold>(A)</bold> increasing and <bold>(B)</bold> decreasing the dose objective. Note that each distribution is highly Gaussian. Distributions of the change in the median and mean doses for the left parotid when <bold>(C)</bold> increasing and <bold>(D)</bold> decreasing the volume objective. The distributions when changing the volume objective are not strictly Gaussian and show dependence on another variable.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The left panels are the distribution of the change in the median and mean doses when the volume objective is changed. They are separated into the first 20%, last 20%, and all transitions. The first 20% transitions show a higher response in the change in median dose. The 2D plots of the right panel are of the change in the median dose with the initial volume objective. <inline-formula id="inf66">
<mml:math id="m82">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3c1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> gives the Spearman rank correlation coefficient between the two variables.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g002.tif"/>
</fig>
<p>Regarding checking the Markov property of the system, non-negligible correlations were found between the change in the state and past states when increasing or decreasing the volume objective. However, this was shown to be from the correlation of the current state and the past states. The current position of the volume objective was very highly correlated and, in some cases, completely determined by the past few states. No extra correlation was present between the transition and past states beyond the correlation between the states themselves.</p>
</sec>
<sec id="s3-2">
<title>3.2 Model analysis and validation</title>
<p>The first model (model 1) was successfully trained on the training set and tested on the testing set. In this model, the agent could only change the dose objective. Once trained, the agent was able to successfully plan for the given goals on the testing set. For each organ at risk goal (left parotid, right parotid, and oral cavity), the agent successfully acted to lower the median dose of each in an efficient manner. All of this was accomplished while maintaining PTV coverage. The agent was also observed to switch back and forth between OARs during planning. This is shown in <xref ref-type="fig" rid="F3">Figure 3</xref> as the agent planned for each goal simultaneously balancing against the need to cover the PTV. This contrasts with the planning done by purely selecting random actions where no goal was met. <xref ref-type="fig" rid="F3">Figure 3</xref> also shows that for the vast majority of plans, the agent was able to fully reduce the individual costs for the median doses connected to the OARs, while the random actions were not capable of accomplishing this. Thus, the agent learned to plan accordingly to the given actions. An inherent bias was observed in model 1 when trained only using a 15-case set. This was seen by the agent preferring to spare the left parotid over the right parotid. The bias can be seen in the differing weighting matrices. The state&#x2013;action pair for lowering the left parotid objective and the current dose of the left parotid was much higher than that for lowering the right parotid objective and the current dose of the right parotid. This is shown in <xref ref-type="fig" rid="F4">Figure 4</xref> along with the magnitude of the corresponding state element&#x2013;action pairs throughout training between the small and large training sets. 
The decision boundary slope between the state&#x2013;action pairs of lowering the dose and the current dose of the parotids can be projected onto the dimensions of the state representing the dose of the left and right parotids. The slope of this projection would describe the bias between the two and is plotted in <xref ref-type="fig" rid="F4">Figure 4</xref> as well.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Each plot gives the relative cost, with an initial cost of 1, at each action step for model 1&#x2019;s validation set. The solid lines are the mean, and the shaded region is the standard deviation for all validation sets. The (blue) agent consistently reduces the cost for all organs at risk while maintaining the target coverage. The (green) random agent is given to show the improbability of making the correct choices made by the agent.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold> This is a plot of the magnitudes of the state&#x2013;action pairs in the weighting matrix throughout training. Each value corresponds to the magnitude of the pair corresponding to the action of lowering the dose for an organ and the state of the dose of that organ. <bold>(B)</bold> This is a plot of the slope of the decision boundary of the hyperplane between lowering the left parotid and the right parotid projected onto the portion of the state with the dose of the left and right parotids. In this configuration, a slope of 1 would indicate a completely unbiased decision boundary. <bold>(C)</bold> The bottom is a portion of the weighting matrices of the Q-function for the small dataset (biased) and large dataset (unbiased). The corresponding values highlight the asymmetry and symmetry between sparing the left and right parotids as well as the secondary importance of the oral cavity determined by the agent.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g004.tif"/>
</fig>
<p>Interestingly, the agent learned to spare the oral cavity only secondarily to both parotids, even though equal weighting was given in the reward function. This can be seen in the decision boundaries. The region in which sparing of the oral cavity is preferred is much smaller than that for the parotids. The region&#x2019;s size is dependent on the current dose of the oral cavity and grows linearly with it. The decision boundaries between the three organs are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. The fact that the agent learned to spare the oral cavity secondary to the parotids is most likely due to the relative difficulty of reaching the goals between the organs. The oral cavity&#x2019;s goal is normally much easier to reach than with the parotids. Thus, although the weighting in the reward function is the same, the parotids experience much higher rewards early on as they lie further away from the goal.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>
<bold>(A)</bold> This is a projection of the decision surface onto the right and left parotid mean dose portions of the state. It can be seen that there is little bias between the two from the slope of the boundary between lowering the objective on the left and right parotid. If the dose of the parotids is below some limit, then from the Q-function, the agent will want to increase the dose. With the dose of both parotids close to the goal, the space for lowering the oral cavity objective is activated. <bold>(B)</bold> The bottom two plots are projections of the decision surface onto the oral cavity mean dose and the left (bottom left) and right (bottom right) parotid mean doses.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g005.tif"/>
</fig>
<p>Model 2 was also successfully trained and implemented. Analysis of the resulting weighting matrix showed a strategy in which there is a trading off between lowering the objective volume and dose for the parotid glands. This trade-off is a function of the current position of the objectives and the current mean dose. The weighting matrix and the section of the decision boundary for model 2 are shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. The observed strategy learned from the agent was to lower the dose of the parotids by a trade-off between lowering the objective volume and dose given the decision boundaries between the two. What was observed for the oral cavity was not only reducing its dose of secondary importance but that it was primarily achieved by lowering the dose objective only. When comparing the validated plans to plans produced simply by placing the template objectives, model 2 outperformed the template plans for both parotids, with an average reduction in the mean dose of 7&#xa0;Gy <inline-formula id="inf67">
<mml:math id="m83">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 2.5&#xa0;Gy. This was compared to a 4&#xa0;Gy <inline-formula id="inf68">
<mml:math id="m84">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
<mml:mn>1.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Gy improvement by model 1. For both model 1 and model 2, the improvement in the oral cavity was minimal and approximately 1% on average. Model 2 displayed essentially no improvement over model 1 when comparing the mean dose of the oral cavity. This is in alignment with the priority of adjusting the dose objective over the volume objective.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>
<bold>(A)</bold> This is a plot of the decision boundary between either lowering the objective volume or dose as a function of the current objective volume and dose. Initially, the agent will lower the volume until reaching the trade-off boundary and then begin lowering the dose until it hits the boundary again. This boundary is dynamic and moves to the right with the increasing mean dose. <bold>(B)</bold> Sections of the weighting matrix important to the parotids and oral cavity are zoomed in. The symmetry between the parotids can be seen. Not only can the reduced significance of the oral cavity be observed but also that the preference is to lower the objective dose and not the objective volume.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g006.tif"/>
</fig>
<p>Model 2 produced plans with distributions that are highly comparable with those of the clinical plans. For comparisons with clinical plans, all plans were normalized such that PTV coverage was the same with <inline-formula id="inf69">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mrow>
<mml:mn>95</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>44</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Gy. In the first case where model 2 was used with no state scaling for plan-specific goals, it produced slightly better plans than the clinical plans for the primary plans at 44&#xa0;Gy with <inline-formula id="inf70">
<mml:math id="m86">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The difference was mainly within single-side sparing cases in the sparing of the parotid with very high mean doses seen in the clinical plan. For the boost PTV plans at 26&#xa0;Gy, model 2 produced plans within 1&#xa0;Gy of the mean doses from the clinical plans for both parotids and the oral cavity with <inline-formula id="inf71">
<mml:math id="m87">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0.83</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> for all three OARs. Combining the primary and boosted plans resulted in very comparable sum plans with the clinical sum plans, with an overall composite <inline-formula id="inf72">
<mml:math id="m88">
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-value of 0.07. When the dosimetric portion of the state was scaled to account for plan-specific goals, model 2 produced plans highly similar to the clinical plans. With all plans normalized such that D<sub>95%</sub> for the PTV was 44&#xa0;Gy, the plans produced by the agent tended to have slightly higher hotspots than the clinical plans. D<sub>max</sub> for the clinical plans averaged approximately to 49&#xa0;Gy, while the plans produced by model 2 averaged approximately to 51&#xa0;Gy (<inline-formula id="inf73">
<mml:math id="m89">
<mml:mrow>
<mml:mo>&#xb1;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>1&#xa0;Gy). The sum plan dosimetry for model 2 is compared to the clinical sum plans shown in <xref ref-type="fig" rid="F7">Figure 7</xref> without plan-specific goals and <xref ref-type="fig" rid="F8">Figure 8</xref> with plan-specific goals. All the statistical analyses are summarized in <xref ref-type="table" rid="T1">Table 1</xref>. The RL planning agent was able to produce these plans used in validation in an average of 13.58&#xa0;min with a minimum and a maximum planning time of 2.27 and 44.82&#xa0;min, respectively.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Each plot is a scatter plot for the dosimetric endpoints where each point is a specific plan between model 2 and the clinical plans for the <bold>(A)</bold> primary plan at 44&#xa0;Gy, <bold>(B)</bold> boost plan at 26&#xa0;Gy, and <bold>(C)</bold> plan sum.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g007.tif"/>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Each plot is a scatter plot for the dosimetric endpoints where each point is a specific plan between model 2 with state scaling and the clinical plans for the <bold>(A)</bold> primary plan at 44&#xa0;Gy, <bold>(B)</bold> boost plan at 26&#xa0;Gy, and <bold>(C)</bold> plan sum.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g008.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Statistical analysis between models, template plans, and clinical plans.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">
<inline-formula id="inf74">
<mml:math id="m90">
<mml:mrow>
<mml:mo>&#x2206;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Model 1 vs template</th>
<th align="center">Model 2 vs template</th>
<th align="center">Model 2 vs model 1</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Left parotid</td>
<td align="center">&#x2212;4.00 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;6.96 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;2.96 Gy (&#x3c;0.01)</td>
</tr>
<tr>
<td align="left">Right parotid</td>
<td align="center">&#x2212;4.11 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;7.14 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;3.03 Gy (&#x3c;0.01)</td>
</tr>
<tr>
<td align="left">Oral cavity</td>
<td align="center">0.26 Gy (0.9)</td>
<td align="center">0.10 Gy (0.97)</td>
<td align="center">&#x2212;0.17 Gy (0.94)</td>
</tr>
<tr>
<td align="left">Total plan</td>
<td align="center">(&#x3c;0.01)</td>
<td align="center">(&#x3c;0.01)</td>
<td align="center">(0.041)</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th align="center"/>
<th colspan="3" align="center">Model 2 vs clinical</th>
</tr>
<tr>
<th align="left">
<inline-formula id="inf75">
<mml:math id="m91">
<mml:mrow>
<mml:mo>&#x2206;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Primary (44 Gy)</th>
<th align="center">Boost (26 Gy)</th>
<th align="center">Plan sum (70 Gy)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Left parotid</td>
<td align="center">&#x2212;5.40 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;0.98 Gy (0.5)</td>
<td align="center">&#x2212;6.30 Gy (0.02)</td>
</tr>
<tr>
<td align="left">Right parotid</td>
<td align="center">&#x2212;5.79 Gy (&#x3c;0.01)</td>
<td align="center">&#x2212;0.42 Gy (0.7)</td>
<td align="center">&#x2212;6.08 Gy (0.02)</td>
</tr>
<tr>
<td align="left">Oral cavity</td>
<td align="center">&#x2212;0.29 Gy (0.9)</td>
<td align="center">0.85 Gy (0.6)</td>
<td align="center">&#x2212;0.67 Gy (0.87)</td>
</tr>
<tr>
<td align="left">Total plan</td>
<td align="center">(&#x3c;0.01)</td>
<td align="center">(0.83)</td>
<td align="center">(0.07)</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th align="center"/>
<th colspan="3" align="center">Model 2&#x2a; vs clinical</th>
</tr>
<tr>
<th align="left">
<inline-formula id="inf76">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#x2206;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>v</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Primary (44 Gy)</th>
<th align="center">Boost (26 Gy)</th>
<th align="center">Plan sum (70 Gy)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left">Left parotid</td>
<td align="center">&#x2212;1.38 Gy (0.36)</td>
<td align="center">4.16 Gy (0.48)</td>
<td align="center">&#x2212;2.61 Gy (0.38)</td>
</tr>
<tr>
<td align="left">Right parotid</td>
<td align="center">&#x2212;1.70 Gy (0.28)</td>
<td align="center">&#x2212;0.78 Gy (0.58)</td>
<td align="center">&#x2212;2.51 Gy (0.36)</td>
</tr>
<tr>
<td align="left">Oral cavity</td>
<td align="center">0.41 Gy (0.87)</td>
<td align="center">0.91 Gy (0.6)</td>
<td align="center">1.4 Gy (0.74)</td>
</tr>
<tr>
<td align="left">Total plan</td>
<td align="center">(0.43)</td>
<td align="center">(0.72)</td>
<td align="center">(0.67)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The first value is the mean difference between the two, with the second being the <italic>p</italic>-value. A negative number indicates the first listed was less than the second, and in the cases with model 2 against the clinical, a negative value indicates that model 2 was lower on average. Model 2&#x2a; indicates that the plan states were scaled to plan-specific goals while model 2 uses static, plan-independent goals. A <italic>p</italic>-value less than 0.05 indicates a statistical difference, while any values greater than 0.05 show no statistical difference between the two datasets. All plans were normalized such that PTV coverage was the same with <inline-formula id="inf77">
<mml:math id="m93">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">D</mml:mi>
<mml:mrow>
<mml:mn mathvariant="bold">95</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn mathvariant="bold">44</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> Gy.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>With comparable dosimetry endpoints, model 2 produced slightly different DVH shapes compared to the clinical plans. The clinical plans had a sharper PTV DVH slope from a dose range of 95% to 105%, with fewer hotspots. The oral cavity DVH shapes were very similar between the two plans. For both parotids, even with similar final mean doses, the DVHs had noticeably different shapes in many cases. Typically, the clinical case DVHs were higher in the lower-dose regions and lower in the high-dose regions than those for model 2. Cross-over points often happened between 40% and 50% of the volume. For model 2, PTV coverage had comparable dose decreases to clinical plans, in the range from 90% to 50% dose. Model 2 produced less sparing of organs not included in the model like the spinal cord, larynx, and pharynx. These were included in optimization but with static plan-independent objectives. These static objectives may or may not reflect the optimal sparing of these organs and thus would lead to the observed discrepancies. Model 2 also had stronger normal tissue sparing as these were also not manipulated by the agent. A human planner may make the decision to sacrifice some normal tissue sparing, but the agent currently cannot make that decision. Some examples of the dose distribution are shown in <xref ref-type="fig" rid="F9">Figure 9</xref> and examples of the DVHs in <xref ref-type="fig" rid="F10">Figure 10</xref>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>
<bold>(A)</bold> Clinical vs. <bold>(B)</bold> agent-delivered dose distributions.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g009.tif"/>
</fig>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Selection of DVHs comparing the plans produced by model 2 to corresponding clinical plans. The solid lines represent the plans created by the auto-planning agent, while the dashed line represents the corresponding clinical plan. RX notes that the agent was using plan-specific goals.</p>
</caption>
<graphic xlink:href="fphy-12-1331849-g010.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>Overall, both models 1 and 2 showed significant steps toward the goal of producing an overall auto-planning agent. The models presented satisfy all conditions necessary in an MDP and provided a meaningful environment for agent learning. This is an important component to consider when developing an RL agent. Most of the models can take weeks to train, and increasing the size of the model will exponentially increase that time. Knowing the consistency of the environment is crucial with a lag time this large in between results.</p>
<p>It is not surprising that model 1 failed to plan for the mean dose after successfully planning for the median dose. The median dose is directly linked to a specific dose&#x2013;volume objective, namely, the dose at 50% volume. So the response of this goal will be large when changing the specific objective as had been seen with the transition probabilities. What was also seen with the transition probabilities is that the mean dose had a much smaller response and thus would need more movement to completely reduce it to the desired amount. Thus, the movement in the volume space expanded the desired total movement amount. Hence, allowing the agent to move in the dose and volume space greatly improved the agent&#x2019;s planning ability. Moving the objective diagonally in the dose&#x2013;volume space will reduce the area under the curve more effectively than simply moving it in the dose space. It was also apparent from the strong correlation of the transition probabilities to the current location of the objective that including this information into the state function is crucial to provide the agent with as much needed information as possible.</p>
<p>Model 2 showed very promising results when compared to the clinical plans. In both scenarios of including and not including plan-specific goals, the agent produced plans statistically similar to those used in the clinic. This included producing very comparable and acceptable dose distributions. This is quite promising as the agent created these plans in a matter of minutes without human intervention. It should be noted though that only the three organs mentioned were included. For a fully automated planning agent, all organs would need to be considered and more objective control points may need to be added to the PTV to better control the coverage/hotspot trade-off. This can be built upon the framework presented.</p>
<p>Another interesting result was that of discovering the model bias that seemed to be dependent on the training set size. No apparent bias was found in the smaller training set when anatomical and geometric values were investigated. However, the small bias inherent to the set was exacerbated when using a small training set. This resulted in a slightly different final dosimetry as the biased organs had greater sparing. This may not be important for some cases, but if the agent is not able to get the preferred organ below a certain point, it could spend the entire planning time on it without considering the others. Currently, the agent has no way of giving up on a goal, and this could be an interesting avenue for future work.</p>
<p>Another limitation is the lack of heterogeneity correction in the dose calculation and optimization model. This is not a big difference for the sites studied here but could potentially pose issues when dealing with lung cases. The large air or vapor areas in the lungs can drastically affect photon and electron transport as compared to normal tissue or even bone. Thus, in these instances, larger differences due to heterogeneities could be present.</p>
<p>Computational cost is also a potential limitation to this and future studies. Model 1 exceeded 10,000 training iterations with only one objective per the three OARs, and model 2 reached over 24,000 training iterations. Therefore, using the current computation setup, it would be difficult to include multiple control points for multiple organs as well as more PTV objectives to ensure a sharp DVH for the target. Since the RL problem is iterative and is based on Markov decision chains, large-scale parallelization is not an option. The needed speed-up would need to be in the optimization step. With so many optimization steps, a small reduction in cost would potentially lead to a very large reduction for the entire training process.</p>
<p>Even given the methods mentioned in the introduction that can predict the achievable DVH or dosimetric endpoints for an OAR given the patient anatomy, it would be more complete to remove the goal as input and have it inferred by the agent. This would allow more flexibility and remove a prior step. To accomplish this goal, anatomical features would need to be included into the state function. It has already been shown that certain features are good predictors for the final achievable median dose [<xref ref-type="bibr" rid="B9">9</xref>]. These include the median distance from target, the overlap percentage between the organ and the target, and the total volume within an organ specific range. These would be simple factors to add into the state function to allow the agent spatial and anatomical information to adjudicate the goal for each of the glands.</p>
<p>There are a few other limitations to this study that further work can improve on. The first is that a larger dataset from multiple institutions could be used. The reason for this is to incorporate a larger and more diverse patient population and include institutional differences in both the training and evaluation of the model. Another is a study on the selection of hyper-parameters. The long training time for the models limits the ability to tune hyper-parameters, and thus more work is needed in selecting these. Finally, this study also incorporates a discretized action space, when in reality a human planner can change the objectives by any real value and has control over all regions of interest. The addition of regions of interest must be done carefully in order to reduce the computational cost.</p>
<p>The SARSA algorithm presented is very simple. However, it has been shown to be powerful. The presented model architecture provides a very solid foundation with the ability to interpret the learning of the agent. These methods rely on relatively small datasets and provide the potential of moving into deep learning methodologies as understanding increases.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>An RL model was developed and tested for the purposes of creating IMRT plans for HN cancer treatment. The proposed models were based on including dosimetric and objective information in the state function and were shown to perform in a Markovian fashion that well-approximates the conditions required by an RL model. The proposed model was shown to make significant improvements over template plans, creating plans that were statistically similar to clinical plans and made in a fraction of the time. The methods and results presented here have shown that RL can be used to develop efficient IMRT planning agents that automatically create clinically acceptable plans in a matter of minutes. This will allow not only for building upon this model for HN cancers but for other treatment sites as well.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The raw data supporting the conclusion of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>HS: conceptualization, data curation, formal analysis, investigation, methodology, software, validation, visualization, and writing&#x2013;original draft. XL: data curation, software, and writing&#x2013;review and editing. YS: supervision and writing&#x2013;review and editing. QW: conceptualization, methodology, project administration, resources, supervision, and writing&#x2013;review and editing. YG: conceptualization, supervision, and writing&#x2013;review and editing. QJW: conceptualization, funding acquisition, methodology, project administration, resources, supervision, and writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The authors declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by an NIH Grant (&#x23;R01CA201212) as well as a Varian research grant.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The authors declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dawes</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Wood</surname>
<given-names>CM</given-names>
</name>
</person-group>. <article-title>The contribution of oral minor mucous gland secretions to the volume of whole saliva in man</article-title>. <source>Arch Oral Biol</source> (<year>1973</year>) <volume>18</volume>(<issue>3</issue>):<fpage>337</fpage>&#x2013;<lpage>42</lpage>. <pub-id pub-id-type="doi">10.1016/0003-9969(73)90156-8</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Deasy</surname>
<given-names>JO</given-names>
</name>
<name>
<surname>Moiseenko</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Marks</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Chao</surname>
<given-names>KSC</given-names>
</name>
<name>
<surname>Nam</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Eisbruch</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Radiotherapy dose&#x2013;volume effects on salivary gland function</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2010</year>) <volume>76</volume>(<issue>3</issue>):<fpage>S58</fpage>&#x2013;<lpage>63</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2009.06.090</pub-id>
</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Patrik Brodin</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Tom&#xe9;</surname>
<given-names>WA</given-names>
</name>
</person-group>. <article-title>Revisiting the dose constraints for head and neck OARs in the current era of IMRT</article-title>. <source>Oral Oncol</source> (<year>2018</year>) <volume>86</volume>:<fpage>8</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1016/j.oraloncology.2018.08.018</pub-id>
</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>ZH</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>SZ</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>ZY</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>CP</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>HS</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>WY</given-names>
</name>
<etal/>
</person-group> <article-title>Protecting the oral mucosa in patients with oral tongue squamous cell carcinoma treated postoperatively with intensity-modulated radiotherapy: a randomized study</article-title>. <source>The Laryngoscope</source> (<year>2012</year>) <volume>122</volume>(<issue>2</issue>):<fpage>291</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1002/lary.22434</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Puri</surname>
<given-names>DR</given-names>
</name>
<name>
<surname>Blanco</surname>
<given-names>AI</given-names>
</name>
<name>
<surname>Chao</surname>
<given-names>KSC</given-names>
</name>
</person-group>. <article-title>Intensity-modulated radiation therapy in head and neck cancers: an update</article-title>. <source>Head Neck</source> (<year>2007</year>) <volume>29</volume>(<issue>4</issue>):<fpage>387</fpage>&#x2013;<lpage>400</lpage>. <pub-id pub-id-type="doi">10.1002/hed.20332</pub-id>
</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gupta</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Phurailatpam</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Kannan</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Ghosh-Laskar</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Three-dimensional conformal radiotherapy (3D-CRT) versus intensity modulated radiation therapy (IMRT) in squamous cell carcinoma of the head and neck: a randomized controlled trial</article-title>. <source>Radiother Oncol J Eur Soc Ther Radiol Oncol</source> (<year>2012</year>) <volume>104</volume>(<issue>3</issue>):<fpage>343</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1016/j.radonc.2012.07.001</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hunt</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Jackson</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Narayana</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Geometric factors influencing dosimetric sparing of the parotid glands using IMRT</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2006</year>) <volume>66</volume>(<issue>1</issue>):<fpage>296</fpage>&#x2013;<lpage>304</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2006.05.028</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Anand</surname>
<given-names>AK</given-names>
</name>
<name>
<surname>Jain</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Negi</surname>
<given-names>PS</given-names>
</name>
<name>
<surname>Chaudhoory</surname>
<given-names>AR</given-names>
</name>
<name>
<surname>Sinha</surname>
<given-names>SN</given-names>
</name>
<name>
<surname>Choudhury</surname>
<given-names>PS</given-names>
</name>
<etal/>
</person-group> <article-title>Can dose reduction to one parotid gland prevent xerostomia? A feasibility study for locally advanced head and neck cancer patients treated with intensity-modulated radiotherapy</article-title>. <source>Clin Oncol R Coll Radiol G B</source> (<year>2006</year>) <volume>18</volume>(<issue>6</issue>):<fpage>497</fpage>&#x2013;<lpage>504</lpage>. <pub-id pub-id-type="doi">10.1016/j.clon.2006.04.014</pub-id>
</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>WR</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>FF</given-names>
</name>
<name>
<surname>Kirkpatrick</surname>
<given-names>JP</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>QJ</given-names>
</name>
</person-group>. <article-title>Quantitative analysis of the factors which affect the interpatient organ-at-risk dose sparing variation in IMRT plans</article-title>. <source>Med Phys</source> (<year>2012</year>) <volume>39</volume>(<issue>11</issue>):<fpage>6868</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1118/1.4757927</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Ricchetti</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Sanguineti</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Kazhdan</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Simari</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Chuang</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <article-title>Patient geometry-driven information retrieval for IMRT treatment plan quality control</article-title>. <source>Med Phys</source> (<year>2009</year>) <volume>36</volume>(<issue>12</issue>):<fpage>5497</fpage>&#x2013;<lpage>505</lpage>. <pub-id pub-id-type="doi">10.1118/1.3253464</pub-id>
</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moore</surname>
<given-names>KL</given-names>
</name>
<name>
<surname>Brame</surname>
<given-names>RS</given-names>
</name>
<name>
<surname>Low</surname>
<given-names>DA</given-names>
</name>
<name>
<surname>Mutic</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Experience-based quality control of clinical intensity-modulated radiotherapy planning</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2011</year>) <volume>81</volume>(<issue>2</issue>):<fpage>545</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2010.11.030</pub-id>
</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Thongphiew</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>FF</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>QJ</given-names>
</name>
</person-group>. <article-title>A planning quality evaluation tool for prostate adaptive IMRT based on machine learning</article-title>. <source>Med Phys</source> (<year>2011</year>) <volume>38</volume>(<issue>2</issue>):<fpage>719</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1118/1.3539749</pub-id>
</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Appenzoller</surname>
<given-names>LM</given-names>
</name>
<name>
<surname>Michalski</surname>
<given-names>JM</given-names>
</name>
<name>
<surname>Thorstad</surname>
<given-names>WL</given-names>
</name>
<name>
<surname>Mutic</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>KL</given-names>
</name>
</person-group>. <article-title>Predicting dose-volume histograms for organs-at-risk in IMRT planning</article-title>. <source>Med Phys</source> (<year>2012</year>) <volume>39</volume>(<issue>12</issue>):<fpage>7446</fpage>&#x2013;<lpage>61</lpage>. <pub-id pub-id-type="doi">10.1118/1.4761864</pub-id>
</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yuan</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>QJ</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>FF</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Incorporating single-side sparing in models for predicting parotid dose sparing in head and neck IMRT</article-title>. <source>Med Phys</source> (<year>2014</year>) <volume>41</volume>(<issue>2</issue>):<fpage>021728</fpage>. <pub-id pub-id-type="doi">10.1118/1.4862075</pub-id>
</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kuo</surname>
<given-names>YH</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>JA</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>TC</given-names>
</name>
<name>
<surname>Juan</surname>
<given-names>CJ</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>CC</given-names>
</name>
<name>
<surname>Chien</surname>
<given-names>CR</given-names>
</name>
</person-group>. <article-title>Comparative effectiveness of simultaneous integrated boost vs sequential intensity-modulated radiotherapy for oropharyngeal or hypopharyngeal cancer patients</article-title>. <source>Medicine (Baltimore)</source> (<year>2019</year>) <volume>98</volume>(<issue>51</issue>):<fpage>e18474</fpage>. <pub-id pub-id-type="doi">10.1097/md.0000000000018474</pub-id>
</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nelms</surname>
<given-names>BE</given-names>
</name>
<name>
<surname>Robinson</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Markham</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Velasco</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Boyd</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Narayan</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Variation in external beam treatment plan quality: an inter-institutional study of planners and planning systems</article-title>. <source>Pract Radiat Oncol</source> (<year>2012</year>) <volume>2</volume>(<issue>4</issue>):<fpage>296</fpage>&#x2013;<lpage>305</lpage>. <pub-id pub-id-type="doi">10.1016/j.prro.2011.11.012</pub-id>
</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sheng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Stephens</surname>
<given-names>H</given-names>
</name>
<etal/>
</person-group> <article-title>Artificial intelligence applications in intensity modulated radiation treatment planning: an overview</article-title>. <source>Quant Imaging Med Surg</source> (<year>2021</year>) <volume>11</volume>(<issue>12</issue>):<fpage>4859</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.21037/qims-21-208</pub-id>
</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kubo</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Monzen</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Ishii</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Tamura</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Kawamorita</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Sumida</surname>
<given-names>I</given-names>
</name>
<etal/>
</person-group> <article-title>Dosimetric comparison of RapidPlan and manually optimized plans in volumetric modulated arc therapy for prostate cancer</article-title>. <source>Phys Med PM Int J Devoted Appl Phys Med Biol Off J Ital Assoc Biomed Phys AIFB</source> (<year>2017</year>) <volume>44</volume>:<fpage>199</fpage>&#x2013;<lpage>204</lpage>. <pub-id pub-id-type="doi">10.1016/j.ejmp.2017.06.026</pub-id>
</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Carmona</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Sirak</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Kasaova</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Followill</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Michalski</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>Highly efficient training, refinement, and validation of a knowledge-based planning quality-control system for radiation therapy clinical trials</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2017</year>) <volume>97</volume>(<issue>1</issue>):<fpage>164</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2016.10.005</pub-id>
</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Scaggion</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Fusella</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Roggio</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Bacco</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Pivato</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Rossato</surname>
<given-names>MA</given-names>
</name>
<etal/>
</person-group> <article-title>Reducing inter- and intra-planner variability in radiotherapy plan output with a commercial knowledge-based planning solution</article-title>. <source>Phys Med PM Int J Devoted Appl Phys Med Biol Off J Ital Assoc Biomed Phys AIFB</source> (<year>2018</year>) <volume>53</volume>:<fpage>86</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1016/j.ejmp.2018.08.016</pub-id>
</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hussein</surname>
<given-names>M</given-names>
</name>
<name>
<surname>South</surname>
<given-names>CP</given-names>
</name>
<name>
<surname>Barry</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Adams</surname>
<given-names>EJ</given-names>
</name>
<name>
<surname>Jordan</surname>
<given-names>TJ</given-names>
</name>
<name>
<surname>Stewart</surname>
<given-names>AJ</given-names>
</name>
<etal/>
</person-group> <article-title>Clinical validation and benchmarking of knowledge-based IMRT and VMAT treatment planning in pelvic anatomy</article-title>. <source>Radiother Oncol J Eur Soc Ther Radiol Oncol</source> (<year>2016</year>) <volume>120</volume>(<issue>3</issue>):<fpage>473</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1016/j.radonc.2016.06.022</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tol</surname>
<given-names>JP</given-names>
</name>
<name>
<surname>Delaney</surname>
<given-names>AR</given-names>
</name>
<name>
<surname>Dahele</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Slotman</surname>
<given-names>BJ</given-names>
</name>
<name>
<surname>Verbakel</surname>
<given-names>WFAR</given-names>
</name>
</person-group>. <article-title>Evaluation of a knowledge-based planning solution for head and neck cancer</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2015</year>) <volume>91</volume>(<issue>3</issue>):<fpage>612</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2014.11.014</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chang</surname>
<given-names>ATY</given-names>
</name>
<name>
<surname>Hung</surname>
<given-names>AWM</given-names>
</name>
<name>
<surname>Cheung</surname>
<given-names>FWK</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>MCH</given-names>
</name>
<name>
<surname>Chan</surname>
<given-names>OSH</given-names>
</name>
<name>
<surname>Philips</surname>
<given-names>H</given-names>
</name>
<etal/>
</person-group> <article-title>Comparison of planning quality and efficiency between conventional and knowledge-based algorithms in nasopharyngeal cancer patients using intensity modulated radiation therapy</article-title>. <source>Int J Radiat Oncol Biol Phys</source> (<year>2016</year>) <volume>95</volume>(<issue>3</issue>):<fpage>981</fpage>&#x2013;<lpage>90</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2016.02.017</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Gonzalez</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>McBeth</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>N</given-names>
</name>
<etal/>
</person-group> <article-title>Operating a treatment planning system using a deep-reinforcement learning-based virtual treatment planner for prostate cancer intensity-modulated radiation therapy treatment planning</article-title>. <source>Med Phys</source> (<year>2020</year>) <volume>47</volume>(<issue>6</issue>):<fpage>2329</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1002/mp.14114</pub-id>
</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sprouts</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Chi</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>The development of a deep reinforcement learning network for dose-volume-constrained treatment planning in prostate cancer intensity modulated radiotherapy</article-title>. <source>Biomed Phys Eng Express</source> (<year>2022</year>) <volume>8</volume>(<issue>4</issue>):<fpage>045008</fpage>. <pub-id pub-id-type="doi">10.1088/2057-1976/ac6d82</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Kyun Park</surname>
<given-names>Y</given-names>
</name>
</person-group>. <article-title>Implementation and evaluation of an intelligent automatic treatment planning robot for prostate cancer stereotactic body radiation therapy</article-title>. <source>Radiother Oncol</source> (<year>2023</year>) <volume>184</volume>:<fpage>109685</fpage>. <pub-id pub-id-type="doi">10.1016/j.radonc.2023.109685</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>An integrated solution of deep reinforcement learning for automatic IMRT treatment planning in non-small-cell lung cancer</article-title>. <source>Front Oncol</source> (<year>2023</year>) <volume>13</volume>:<fpage>1124458</fpage>. <pub-id pub-id-type="doi">10.3389/fonc.2023.1124458</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Sheng</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Palta</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Czito</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Willett</surname>
<given-names>C</given-names>
</name>
<etal/>
</person-group> <article-title>An interpretable planning bot for pancreas stereotactic body radiation therapy</article-title>. <source>Int J Radiat Oncol</source> (<year>2021</year>) <volume>109</volume>(<issue>4</issue>):<fpage>1076</fpage>&#x2013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2020.10.019</pub-id>
</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Stephens</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>QJ</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Introducing matrix sparsity with kernel truncation into dose calculations for fluence optimization</article-title>. <source>Biomed Phys Eng Express</source> (<year>2021</year>) <volume>8</volume>(<issue>1</issue>):<fpage>8</fpage>. <pub-id pub-id-type="doi">10.1088/2057-1976/ac35f8</pub-id>
</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Sutton</surname>
<given-names>RS</given-names>
</name>
<name>
<surname>Barto</surname>
<given-names>AG</given-names>
</name>
</person-group>. <article-title>Reinforcement learning: an introduction</article-title>. In: <person-group person-group-type="editor">
<name>
<surname>Bach</surname>
<given-names>F</given-names>
</name>
</person-group>, editor. <source>Adaptive computation and machine learning series</source>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>A Bradford Book</publisher-name> (<year>1998</year>). p. <fpage>344</fpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O</given-names>
</name>
<etal/>
</person-group> <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J Mach Learn Res</source> (<year>2011</year>) <volume>12</volume>(<issue>85</issue>):<fpage>2825</fpage>&#x2013;<lpage>30</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1201.0490</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>