<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2021.740353</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Stratification by Tumor Grade Groups in a Holistic Evaluation of Machine Learning for Brain Tumor Segmentation</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Prabhudesai</surname> <given-names>Snehal</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1285749/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Nicholas Chandler</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/776814/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ahluwalia</surname> <given-names>Vinayak</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1436896/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Huan</surname> <given-names>Xun</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1191956/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Bapuraj</surname> <given-names>Jayapalli Rajiv</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
<contrib contrib-type="author">
<name><surname>Banovic</surname> <given-names>Nikola</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1415539/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Rao</surname> <given-names>Arvind</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<xref ref-type="aff" rid="aff8"><sup>8</sup></xref>
<xref ref-type="corresp" rid="c003"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1177294/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Computer Science and Engineering, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Computational Medicine and Bioinformatics, Michigan Medicine</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Perelman School of Medicine at the University of Pennsylvania</institution>, <addr-line>Philadelphia, PA</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Mechanical Engineering, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Radiology, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff6"><sup>6</sup><institution>Department of Biostatistics, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff7"><sup>7</sup><institution>Department of Radiation Oncology, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<aff id="aff8"><sup>8</sup><institution>Department of Biomedical Engineering, University of Michigan</institution>, <addr-line>Ann Arbor, MI</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Spyridon Bakas, University of Pennsylvania, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: David Haynor, University of Washington, United States; Francesco Grussu, Vall d&#x00027;Hebron Institute of Oncology (VHIO), Spain; Zeina A. Shboul, Old Dominion University, United States</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Snehal Prabhudesai <email>snehalbp&#x00040;umich.edu</email></corresp>
<corresp id="c002">Nicholas Chandler Wang <email>ncwang&#x00040;med.umich.edu</email></corresp>
<corresp id="c003">Arvind Rao <email>ukarvind&#x00040;umich.edu</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Brain Imaging Methods, a section of the journal Frontiers in Neuroscience</p></fn>
<fn fn-type="equal" id="fn002"><p>&#x02020;These authors share first authorship</p></fn></author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>10</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>15</volume>
<elocation-id>740353</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>01</day>
<month>09</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2021 Prabhudesai, Wang, Ahluwalia, Huan, Bapuraj, Banovic and Rao.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Prabhudesai, Wang, Ahluwalia, Huan, Bapuraj, Banovic and Rao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license></permissions>
<abstract><p>Accurate and consistent segmentation plays an important role in the diagnosis, treatment planning, and monitoring of both High Grade Glioma (HGG), including Glioblastoma Multiforme (GBM), and Low Grade Glioma (LGG). Accuracy of segmentation can be affected by the imaging presentation of glioma, which greatly varies between the two tumor grade groups. In recent years, researchers have used Machine Learning (ML) to segment tumor rapidly and consistently, as compared to manual segmentation. However, existing ML validation relies heavily on computing summary statistics and rarely tests the generalizability of an algorithm on clinically heterogeneous data. In this work, our goal is to investigate how to holistically evaluate the performance of ML algorithms on a brain tumor segmentation task. We address the need for rigorous evaluation of ML algorithms and present four axes of model evaluation&#x02014;diagnostic performance, model confidence, robustness, and data quality. We perform a comprehensive evaluation of a glioma segmentation ML algorithm by stratifying data by specific tumor grade groups (GBM and LGG) and evaluate these algorithms on each of the four axes. The main takeaways of our work are&#x02014;(1) ML algorithms need to be evaluated on out-of-distribution data to assess generalizability, reflective of tumor heterogeneity. (2) Segmentation metrics alone are limited in their ability to evaluate the errors made by ML algorithms and describe their consequences. (3) Adoption of tools in other domains such as robustness (adversarial attacks) and model uncertainty (prediction intervals) leads to a more comprehensive performance evaluation. Such a holistic evaluation framework could shed light on an algorithm&#x00027;s clinical utility and help it evolve into a more clinically valuable tool.</p></abstract>
<kwd-group>
<kwd>medical AI</kwd>
<kwd>evaluation</kwd>
<kwd>brain imaging</kwd>
<kwd>segmentation</kwd>
<kwd>GBM</kwd>
<kwd>LGG</kwd>
</kwd-group>
<contract-num rid="cn001">R37-CA214955</contract-num>
<contract-num rid="cn001">RSG-16-005-01</contract-num>
<contract-sponsor id="cn001">American Cancer Society<named-content content-type="fundref-id">10.13039/100000048</named-content></contract-sponsor>
<counts>
<fig-count count="12"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="66"/>
<page-count count="21"/>
<word-count count="10573"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Accurate and consistent segmentation of gliomas (Chen et al., <xref ref-type="bibr" rid="B12">2017</xref>), is important for diagnosis, treatment planning, and post treatment evaluation. Glioblastoma Multiforme (GBM), the most aggressive of high grade gliomas, has the worst prognosis with a 5-year survival rate of &#x0003C;5% and a median survival of approximately a year even with treatment (Tamimi and Juweid, <xref ref-type="bibr" rid="B54">2017</xref>; Witthayanuwat et al., <xref ref-type="bibr" rid="B62">2018</xref>). Low grade gliomas (LGG), though less aggressive than GBM, reportedly undergo anaplastic progression into higher grade tumors around 70% of the time within 5&#x02013;10 years of diagnosis. The median survival from initial diagnosis is &#x0007E;7 years (Claus et al., <xref ref-type="bibr" rid="B14">2015</xref>).</p>
<p>Current standard of care for High Grade Glioma (HGG), for example GBM, is surgical resection of the tumor followed by radiotherapy combined with the chemotherapeutic agent temozolomide (Tan et al., <xref ref-type="bibr" rid="B55">2020</xref>). Segmentation for the surgical resection for gliomas should be effective for total gross resection or reduction in tumor bulk, without affecting the surrounding normal functional brain tissue. Radiation therapy requires accurate delineation of tumor margins to ensure effective dosage to tumor region. Due to the relative low aggressiveness of LGG, a more conservative management (&#x0201C;wait-and-watch&#x0201D;) approach (Whittle, <xref ref-type="bibr" rid="B61">2004</xref>) is sometimes adopted. Segmentation is important in this scenario also to monitor temporal morphological and volumetric alterations of the tumors during observation, prior to elective tumor resection (Larsen et al., <xref ref-type="bibr" rid="B29">2017</xref>).</p>
<p>However, the imaging presentation of gliomas varies between LGG and HGG, which could affect the accuracy of their segmentation. Most HGGs, such as GBMs, have a heterogeneous appearance on T1-weighted pre-contrast imaging and typically show a heterogeneous thick-walled rim-enhancing appearance on the T1 post-contrast (T1-Gd) sequence, with a surrounding low attenuation of perifocal edema. The overall appearance of HGGs on T2-weighted fluid-attenuated inversion recovery (FLAIR) sequence is heterogeneously hyperintense, with areas corresponding to enhancing and non-enhancing components as seen on T1-weighted post contrast sequence. The advancing non contrast-enhancing FLAIR hyperintense portions of the tumor are of concern to clinicians because it is believed to contain active tumor remote from the apparent enhanced portions of the aggressive core. On the other hand, low grade tumors appear hyperintense on a FLAIR sequence with or without clear margins. On the pre-contrast T1-weighted sequences, the lesions tend to be hypointense and typically do not enhance following administration of gadolinium based agents (Forst et al., <xref ref-type="bibr" rid="B18">2014</xref>; Bulakba&#x0015F;&#x00131; and Paksoy, <xref ref-type="bibr" rid="B8">2019</xref>).</p>
<p>Manually defining the margins of the tumor and surrounding non-enhancing perifocal region remains challenging due to tumor heterogeneity, ill-defined margins, and the varying degrees of perifocal edema. This makes segmentation an arduous task with questionable consistency. In recent years, Machine Learning (ML) techniques have shown potential to assist in tumor segmentation for correct diagnosis and efficient treatment planning (Wadhwa et al., <xref ref-type="bibr" rid="B59">2019</xref>; Bajaj and Chouhan, <xref ref-type="bibr" rid="B1">2020</xref>; Kocher et al., <xref ref-type="bibr" rid="B26">2020</xref>; Nazar et al., <xref ref-type="bibr" rid="B37">2020</xref>). While both HGG, including GBM, and LGG, benefit from accurate segmentation, existing ML validation rarely tests if an algorithm generalizes well to out-of-distribution data that reflects this tumor heterogeneity. Rebsamen et al. (<xref ref-type="bibr" rid="B40">2019</xref>) have shown that implicitly incorporating high-vs.-low tumor grade information in model training could improve model performance. While recent work has evaluated for tumor heterogeneity across geographic populations (McKinney et al., <xref ref-type="bibr" rid="B33">2020</xref>), hospital systems (Zech et al., <xref ref-type="bibr" rid="B65">2018</xref>), and federated learning settings (Sheller et al., <xref ref-type="bibr" rid="B50">2020</xref>), this has yet to be done considering differences between HGG, for example GBM and LGG imaging presentations.</p>
<p>In this work, we address the need for rigorous evaluation of ML algorithms for brain tumor segmentation. We propose a holistic evaluation framework (<xref ref-type="fig" rid="F1">Figure 1</xref>) that takes into account tumor heterogeneity, robustness, and confidence of the ML algorithm, and batch effects that may arise from the data. We demonstrate this framework with a cross-sectional study design similar to Zech et al. (<xref ref-type="bibr" rid="B65">2018</xref>) and analyze how well an ML algorithm trained on one glioma type (either HGG, exemplified by GBM or LGG) generalizes to another, out-of-distribution glioma type. We conduct four experiments and holistically evaluate an ML algorithm for the problem of tumor segmentation:</p>
<p><bold>Diagnostic Performance</bold>: We compute standard segmentation metrics to objectively compare the ML algorithm&#x00027;s segmentation performance against radiologist-annotated ground truth. Results indicate that metrics such as Dice and AUROC do not sufficiently capture differences in generalizability, although the classification matrix reveals clear differences.</p>
<p><bold>Model Confidence</bold>: We measure model confidence in segmentation performance by computing prediction intervals for the brain as well as tumor region. Results indicate that the ML algorithm trained on LGG data is more confident than the rest on all homogeneous as well as mixed data.</p>
<p><bold>Robustness</bold>: We measure the ML algorithm&#x00027;s ability to maintain performance despite adversarial perturbations to test its reliability. Results indicate that the ML algorithm trained only on GBM data was least robust when segmenting tumor corrupted with high levels of noise. Testing performance of the model across out-of-distribution data was performed in all the experiments, but can be considered an extension of robustness testing.</p>
<p><bold>Data Quality (Batch Effects)</bold>: We measure the degree to which MRI scan quality influences segmentation metrics. Results found that scan quality features are not significantly correlated with performance, but that there were some batch effect differences, primarily between LGG and GBM sites.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Simplified flowchart of different axes of holistic evaluation&#x02014;diagnostic performance, robustness, model confidence, and data quality. Axes are ordered by dependency and relation with each other. We recommend models to be evaluated with at least one experiment on each of these axes. We evaluate two aspects of robustness, namely, closeness to decision boundary and generalizability on unseen glioma type. Decision points in the framework lead to alternate paths for researchers to follow.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0001.tif"/>
</fig>
<p>Our results demonstrate the limitations of segmentation metrics, and caution that metrics alone do not capture all aspects of an ML algorithm&#x00027;s performance. We discuss how our findings relate to recent literature in segmentation metrics. We further discuss how such a holistic evaluation framework could shed light on the algorithm&#x00027;s clinical utility in post-deployment scenarios and help it evolve into a more clinically valuable tool (Recht et al., <xref ref-type="bibr" rid="B41">2020</xref>).</p></sec>
<sec sec-type="materials and methods" id="s2">
<title>2. Materials and Methods</title>
<p>The aim of this work is to propose a framework to evaluate model performance across four axes&#x02014;diagnostic performance, model confidence, robustness, and data quality. To demonstrate this framework, we first train ML algorithms by considering tumor heterogeneity. We use publicly accessible code for algorithm development and perform <italic>post-hoc</italic> calibration.</p>
<sec>
<title>2.1. Dataset</title>
<p>We used publicly available Magnetic Resonance Imaging (MRI) from The Cancer Genome Atlas (TCGA) (Clark et al., <xref ref-type="bibr" rid="B13">2013</xref>) Glioblastoma Multiforme (GBM) and Low Grade Glioma (LGG) collections (Bakas et al., <xref ref-type="bibr" rid="B2">2017a</xref>,<xref ref-type="bibr" rid="B3">b</xref>). This included the skull-stripped and co-registered MICCAI-BraTS 2018 Test Dataset (Menze et al., <xref ref-type="bibr" rid="B35">2015</xref>; Bakas et al., <xref ref-type="bibr" rid="B4">2017c</xref>). The data consisted of pre-operative multimodal MR imaging sequences (i.e., T1, T1-Gd, T2, T2-FLAIR) along with their whole-tumor segmentation labels composed of edema, enhancing tumor, and non-enhancing tumor. We combined these labels into a single whole tumor for this study. The patients in the GBM BraTS Test Dataset and the LGG BraTS Test Dataset were split approximately in half and allotted to validation and test datasets. The GBM and LGG data were merged across the three categories to form an ALL dataset. Each patient was associated with 144 pre-operative MRI scans, which were treated as independent data points for 2D segmentation. These MRI scans were cropped to 144 &#x000D7; 144 pixels and further pre-processed by pixel-intensity normalization. <xref ref-type="table" rid="T1">Table 1</xref> describes the total number of patients and total number of MRI scans available in each dataset. The training datasets were used for model development (section 2.2), validation datasets were used to determine hyperparameters and calibrate the models (section 2.3), and test datasets (<italic>D</italic><sub><italic>GBM</italic></sub>, <italic>D</italic><sub><italic>LGG</italic></sub>, <italic>D</italic><sub><italic>ALL</italic></sub>) were used to perform subsequent experiments (section 3).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Split of patients in each of the three datasets.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>GBM patients</bold></th>
<th valign="top" align="center"><bold>LGG patients</bold></th>
<th valign="top" align="center"><bold>ALL patients</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Train</td>
<td valign="top" align="center">102 (14,688)</td>
<td valign="top" align="center">65 (9,360)</td>
<td valign="top" align="center">167 (24,048)</td>
</tr>
<tr>
<td valign="top" align="left">Validation</td>
<td valign="top" align="center">16 (2,304)</td>
<td valign="top" align="center">21 (3,024)</td>
<td valign="top" align="center">37 (5,328)</td>
</tr>
<tr>
<td valign="top" align="left">Test</td>
<td valign="top" align="center">17 (2,448)</td>
<td valign="top" align="center">22 (3,168)</td>
<td valign="top" align="center">39 (5,616)</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>Values in brackets (.) indicate the total number of images available in the dataset for 2D segmentation. Note that henceforth, we refer to the test dataset as D<sub>GBM</sub> (GBM patients only), D<sub>LGG</sub> (LGG patients only), and D<sub>ALL</sub> (All patients&#x02014;GBM and LGG patients)</italic>.</p>
</table-wrap-foot>
</table-wrap></sec>
<sec>
<title>2.2. Network Architecture and Training</title>
<p>We used the state-of-the-art U-Net architecture (Ronneberger et al., <xref ref-type="bibr" rid="B46">2015</xref>) to develop three tumor segmentation models using the GBM, LGG, and ALL train datasets. The U-Net architecture consists of an encoder, decoder, and skip connections. Each module of the encoder consists of 2D Convolution layers, followed by Batch Normalization and MaxPooling layers. Four such modules make up the encoder. The decoder consists of four modules of Conv2DTranspose layers followed by Concatenate layers. The network performs slice-wise (2D) segmentation with multi-modal MRI scans provided as the input. Models were trained with Dice Loss function for 100 epochs on 8 GPUs. Adam optimizer (Kingma, <xref ref-type="bibr" rid="B25">2015</xref>) was used with a learning rate of 1 &#x000D7; 10<sup>&#x02212;4</sup> and a batch size of 128. Data augmentation was used while training each of the models to improve generalization. This consisted of random rotations (0&#x02013;25&#x000B0; range), random zooming (value = 0.2, zooms image by 80&#x02013;120% range), width shift (value = 0.2, horizontal translation of images by up to 20 percent of the width), height shift (value = 0.2, vertical translation of images by up to 20 percent of the height), shear (value = 0.2, clips the image in counter-clockwise direction) and random horizontal flips. We referred to publicly available code for model development, model training, and data augmentation (Dong et al., <xref ref-type="bibr" rid="B16">2017</xref>; Ojika et al., <xref ref-type="bibr" rid="B38">2020</xref>).</p></sec>
<sec>
<title>2.3. Model Calibration</title>
<p>The goal of model calibration is to align the algorithm&#x00027;s predicted probabilities with the observed (ground truth) outcomes (Guo et al., <xref ref-type="bibr" rid="B21">2017</xref>). The calibration process ensures that algorithms do not overstate or understate their confidence in prediction of tumor (Jungo and Reyes, <xref ref-type="bibr" rid="B22">2019</xref>; Mehrtash et al., <xref ref-type="bibr" rid="B34">2020</xref>). Models that have been already trained can be calibrated with <italic>post-hoc</italic> methods (Rousseau et al., <xref ref-type="bibr" rid="B47">2021</xref>). Guo et al. (<xref ref-type="bibr" rid="B21">2017</xref>) recommend performing post-hoc calibration with the same validation dataset (<xref ref-type="table" rid="T1">Table 1</xref>) used for model development. We use Platt Scaling technique (Platt, <xref ref-type="bibr" rid="B39">1999</xref>) for post-hoc calibration due to its simplicity and ease of implementation. To ensure models are properly calibrated, we compute and report common calibration metrics. Negative Log Likelihood (NLL) measures a probabilistic model&#x00027;s quality and is also known as cross-entropy loss. Brier Score (BS) measures the accuracy of probabilistic predictors. Percentage Expected Calibration Error (ECE%) partitions the model&#x00027;s predictions into equally spaced bins and takes a weighted average of the difference between accuracy and model confidence across bins. Percentage maximum calibration error (MCE%) estimates the worst-case deviation between confidence and accuracy. For metric definitions and more information, we refer readers to Mehrtash et al. (<xref ref-type="bibr" rid="B34">2020</xref>) and Guo et al. (<xref ref-type="bibr" rid="B21">2017</xref>). <xref ref-type="table" rid="T2">Table 2</xref> indicates that all models are properly calibrated.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>We first compute calibration metrics on a patient-level, then aggregated by mean.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="middle" align="left" rowspan="2"><bold>Metrics</bold></th>
<th valign="top" align="center" colspan="2" style="border-bottom: thin solid #000000;"><bold><italic>M</italic><sub><italic><bold>GBM</bold></italic></sub></bold></th>
<th valign="top" align="center" colspan="2" style="border-bottom: thin solid #000000;"><bold><italic>M</italic><sub><italic><bold>LGG</bold></italic></sub></bold></th>
<th valign="top" align="center" colspan="2" style="border-bottom: thin solid #000000;"><bold><italic>M</italic><sub><italic><bold>ALL</bold></italic></sub></bold></th>
</tr>
<tr>
<th valign="top" align="center"><bold>Before</bold></th>
<th valign="top" align="center"><bold>After</bold></th>
<th valign="top" align="center"><bold>Before</bold></th>
<th valign="top" align="center"><bold>After</bold></th>
<th valign="top" align="center"><bold>Before</bold></th>
<th valign="top" align="center"><bold>After</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">NLL</td>
<td valign="top" align="center">0.038212</td>
<td valign="top" align="center">0.013506</td>
<td valign="top" align="center">0.070146</td>
<td valign="top" align="center"><bold>0.022842</bold></td>
<td valign="top" align="center">0.056573</td>
<td valign="top" align="center">0.018483</td>
</tr>
<tr>
<td valign="top" align="left">BS</td>
<td valign="top" align="center">0.003519</td>
<td valign="top" align="center"><bold>0.002970</bold></td>
<td valign="top" align="center">0.006020</td>
<td valign="top" align="center">0.005263</td>
<td valign="top" align="center">0.004533</td>
<td valign="top" align="center">0.003862</td>
</tr>
<tr>
<td valign="top" align="left">ECE%</td>
<td valign="top" align="center">0.3413</td>
<td valign="top" align="center">0.1439</td>
<td valign="top" align="center">0.5877</td>
<td valign="top" align="center">0.3141</td>
<td valign="top" align="center">0.4454</td>
<td valign="top" align="center"><bold>0.1876</bold></td>
</tr>
<tr>
<td valign="top" align="left">MCE%</td>
<td valign="top" align="center">36.4552</td>
<td valign="top" align="center">14.0762</td>
<td valign="top" align="center">31.9731</td>
<td valign="top" align="center">14.3702</td>
<td valign="top" align="center">37.0614</td>
<td valign="top" align="center"><bold>13.8812</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>We consider only pixels in the skull-stripped brain to compute these metrics. ECE and MCE are presented in %. Metrics should ideally reduce upon calibration. Columns under each model indicate metric values before and after calibration. Bold values indicate best % decrease or increase as compared to the &#x0201C;before&#x0201D; column. All models improved after calibration</italic>.</p>
</table-wrap-foot>
</table-wrap></sec></sec>
<sec id="s3">
<title>3. Experiments</title>
<p>Here, we perform an experiment on each of the four axes of our evaluation framework. We compute metrics to summarize diagnostic performance, measure model confidence by computing prediction intervals, simulate adversarial attacks to assess robustness and use MRQy package to analyze batch effects in data. For each experiment, we point to related work, and provide details on the experiment procedure. Then, in section 4, we provide the outcome of these experiments. We evaluate each of the calibrated ML algorithms (<italic>M</italic><sub><italic>GBM</italic></sub>, <italic>M</italic><sub><italic>LGG</italic></sub>, and <italic>M</italic><sub><italic>ALL</italic></sub>) on each of the three test datasets (<italic>D</italic><sub><italic>GBM</italic></sub>, <italic>D</italic><sub><italic>LGG</italic></sub>, and <italic>D</italic><sub><italic>ALL</italic></sub>). Thus, we evaluate 3 (models) &#x000D7; 3 (datasets) &#x0003D; 9 conditions.</p>
<sec>
<title>3.1. Metrics for Segmentation Performance</title>
<p>There exist a plethora of metrics to evaluate the performance of a medical image segmentation algorithm (Udupa et al., <xref ref-type="bibr" rid="B57">2006</xref>; Taha and Hanbury, <xref ref-type="bibr" rid="B53">2015</xref>). Each metric focuses on a specific aspect of the algorithm&#x00027;s performance, and is thus limited in capability to describe the algorithm&#x00027;s performance by itself. Several metrics are necessary to describe comprehensive characteristics of segmentation performance (Renard et al., <xref ref-type="bibr" rid="B44">2020</xref>).</p>
<p>We perform this experiment as a baseline, reflective of the current standard practice for evaluation. We follow the guidelines described by Taha and Hanbury (<xref ref-type="bibr" rid="B53">2015</xref>) and select eight metrics to evaluate segmentation performance. Sensitivity (Sens) measures the proportion of tumor pixels that are correctly identified as tumor (foreground). Specificity (Spec) measures the proportion of benign pixels that are correctly identified as benign (background). Positive Predictive Value (PPV) measures the probability that pixels classified as tumor truly belong to parts of the patients&#x00027; brain with a tumor. Negative Predictive Value (NPV) measures the probability that pixels classified as benign truly belong to parts of the patients&#x00027; brain without a tumor. While accuracy can be skewed due to the paucity of tumor pixels in the tumor class, Balanced Accuracy (BAcc) takes into account class imbalance. Dice Coefficient (Dice) and Jaccard Coefficient (Jac.C) both measure the overlap between tumor annotated by the different sources (ML algorithm and the radiologists&#x00027; manual annotations). Area under Receiver Operating Characteristics curve (AUROC) describes the probability that a randomly selected tumor pixel will have a higher predicted probability of being a tumor than a randomly selected benign pixel. We eliminate any extra-cranial regions and only consider the skull-stripped brain for computing the metrics. We compute metrics on a per-patient level, as it offers more granularity than at a population-level.</p></sec>
<sec>
<title>3.2. Prediction Intervals for Model Confidence</title>
<p>Prediction Intervals (PIs) are often reported and considered for medical decision-making (K&#x000FC;mmel et al., <xref ref-type="bibr" rid="B28">2018</xref>). In radiation oncology, Chan et al. (<xref ref-type="bibr" rid="B11">2008</xref>) used prediction intervals to capture uncertainty in tumor and organ movement. While a confidence interval measures the precision of a predicted value, PIs measure the expected range where a future observation would fall, given what has already been observed. The width of the PI is directly proportional to the model uncertainty at that region (Kabir et al., <xref ref-type="bibr" rid="B23">2018</xref>). We use prediction intervals to quantify uncertainty in tumor segmentation.</p>
<p>We use Conformal Quantile Regression (CQR) (Romano et al., <xref ref-type="bibr" rid="B45">2019</xref>) to compute PIs. Construction of PIs is difficult, as PIs can be so small that they don&#x00027;t capture the true magnitude (Type 1 error) or so large that they are uninformative (Type 2 error) (Elder et al., <xref ref-type="bibr" rid="B17">2021</xref>). The CQR method guarantees construction of PIs such that the target value is contained within the PI with miscoverage probability at most &#x003B1; (valid coverage) and that the PIs are informative.</p>
<p>We used the CQR method to compute PIs in a <italic>post-hoc</italic> manner. The method uses a dataset for training the CQR models and a separate test dataset to compute the PIs. To reduce computational cost, we selected summary images (image with the largest tumor) for each patient in the validation and test datasets (<xref ref-type="table" rid="T1">Table 1</xref>). We designed a setup to generate prediction intervals around the calibrated model values. We first obtained logits (model output before the calibration) for the selected summary images for patients in both datasets. The CQR models were trained on validation dataset logits and the corresponding calibrated model predictions as target values. The trained CQR models were then used to compute prediction intervals for test dataset logits. We followed the method described by Romano et al. (<xref ref-type="bibr" rid="B45">2019</xref>) to compute average prediction intervals (API) per-patient in the test set. We then generated API box plots for all nine conditions.</p></sec>
<sec>
<title>3.3. Adversarial Attacks for Robustness</title>
<p>This experiment was designed to test the impact of data quality and potential batch effects on the predictions of the model. There has been a lot of work in other domains on evaluating the adversarial robustness of ML algorithms. The application of imperceptible noise can change the prediction of image classification system from correctly identifying a panda to confidently miscalling the image a gibbon (Goodfellow et al., <xref ref-type="bibr" rid="B20">2015</xref>). There are now a variety of adversarial attack techniques, from white-box techniques that can look inside the algorithm to those that can build attacks simply by testing inputs and outputs. These techniques can provide a useful framework for evaluating the robustness of a medical imaging machine learning system. In tumor imaging in general, Zwanenburg et al. (<xref ref-type="bibr" rid="B66">2019</xref>) showed how radiomics features can be evaluated for robustness by perturbing the tumor mask. Understanding how vulnerable ML algorithms are to noise, and how easily they change their decisions in response, gives a sense of how these ML algorithms might fail.</p>
<p>The adversarial attack used in this experiment was fast gradient signed method (FGSM), described by Goodfellow et al. (<xref ref-type="bibr" rid="B20">2015</xref>). This technique is a white-box method which takes the calculated gradient of the neural network to find the direction of the smallest change that will affect the label of the output. This gradient adversarial noise is multiplied by a factor of epsilon, to vary the strength of the attack. In these experiments the epsilon factor was varied over a range of 0&#x02013;1 (0, 0.005, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0), with more examples on the lower end of the range to evaluate small perturbations.</p>
<p>We performed the FGSM attack on each of the test datasets (<italic>D</italic><sub><italic>GBM</italic></sub>, <italic>D</italic><sub><italic>LGG</italic></sub>, and <italic>D</italic><sub><italic>ALL</italic></sub>), for all three ML algorithms (<italic>M</italic><sub><italic>GBM</italic></sub>, <italic>M</italic><sub><italic>LGG</italic></sub>, and <italic>M</italic><sub><italic>ALL</italic></sub>). The full panel of metrics was computed for each of these experiments. The performance of the ML algorithms was expected to decay as epsilon increased, but the relative robustness of each of the ML algorithms and the way that they decayed was studied as well. The chosen epsilon values were (0, 0.005, 0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, and 1). An epsilon of 0 indicates that no change was made to the image.</p></sec>
<sec>
<title>3.4. MRQy for Analyzing Batch Effects</title>
<p>Magnetic resonance imaging has many strengths in studying and monitoring cancer status, including a variety of sequences to investigate different aspects of tumors. However, the flexibility it provides to radiologists can lead to inconsistencies in protocol and scan quality. MRQy is an MRI quality control package that provides a variety of features assessing the quality of a scan, and other effects that might be considered batch effects (Sadri et al., <xref ref-type="bibr" rid="B48">2020</xref>). The complexity of machine learning algorithms makes it possible for them to pick up on batch effects between sites rather than the underlying biology of a problem.</p>
<p>These MRQy factors were used to audit the susceptibility of the different ML algorithms to scan quality factors. For each of the MRI sequences, MRQy features were calculated independently on the original NIFTI files. The features used per modality were: MEAN, RNG, VAR, CV, PSNR, SNR1, SNR2, SNR3, SNR4, CNR, CVP, CJV, EFC, TSNEX, TSNEY, UMAPX, UMAPY (For metric definitions, Sadri et al., <xref ref-type="bibr" rid="B48">2020</xref>). The metadata and size features were excluded as they were not available, and the sizing was consistent across all the images. The average true positive probability of a tumor pixel having a tumor label was calculated, as well as for true negative, false positive and false negative pixels. These were calculated on a per patient level and then averaged across all the patients in the test set. These values along with Dice score and AUROC were then assessed for their correlation with the MRQy features using Spearman correlation coefficient. MRQy features that are correlated with model performance are potential quality control metrics that might be used to flag problematic cases. False discovery rate (FDR) correction was then performed using Benjamini-Hochberg correction at an alpha of 0.25 (Benjamini and Hochberg, <xref ref-type="bibr" rid="B6">1995</xref>). We used this correction as it is less stringent than a more aggressive Bonferroni correction and was still found to eliminate the uncorrected <italic>p</italic>-values.</p>
<p>Additionally, independent of the metrics, batch effects were investigated using the MRQy parameters to compare TCGA site codes in the combined testing data set (<italic>D</italic><sub><italic>ALL</italic></sub>). The MRQy features were normalized then decomposed using principal component analysis (Tipping and Bishop, <xref ref-type="bibr" rid="B56">1999</xref>). The first two MRQy principal components and their relationship to institution were investigated using ANOVA and paired <italic>T</italic>-tests in the statsmodels python package (Seabold and Perktold, <xref ref-type="bibr" rid="B49">2010</xref>). We hypothesized that some site differences within the data sets might be captured by this dimensionality reduction.</p></sec></sec>
<sec sec-type="results" id="s4">
<title>4. Results</title>
<p>In this section, we present and analyze the results of the four experiments in section 3. We discuss their implications in section 6. Note that we perform these experiments for the pixels within the skull-stripped brain.</p>
<sec>
<title>4.1. Metrics Alone Do Not Sufficiently Describe the Nature and Severity of Segmentation Mistakes</title>
<p>True Negative (TN) panel in <xref ref-type="fig" rid="F2">Figure 2</xref> indicates all models perform equally well in identifying benign pixels. <italic>M</italic><sub><italic>ALL</italic></sub> has the highest percentage TP, indicating the best performance at correctly identifying tumor pixels. On average, due to a higher percentage of False Negatives than False Positives, all algorithms (<italic>M</italic><sub><italic>LGG</italic></sub>,<italic>M</italic><sub><italic>GBM</italic></sub>,<italic>M</italic><sub><italic>ALL</italic></sub>) under-segment tumor more often than they over-segment. The FP value is highest for <italic>M</italic><sub><italic>LGG</italic></sub>. Thus, out of all models, <italic>M</italic><sub><italic>LGG</italic></sub> classifies benign regions as tumor the most (over-segments). The FN value is highest for <italic>M</italic><sub><italic>GBM</italic></sub>, on average. <italic>M</italic><sub><italic>GBM</italic></sub> thus, under-estimates tumor pixels and classifies them as benign (under-segments).</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Confusion Matrix to assess the performance of M<sub><italic>GBM</italic></sub>, M<sub><italic>LGG</italic></sub>, and M<sub><italic>ALL</italic></sub> across stratified and composite datasets. The y-axis denotes percentage of total pixels in a test dataset classified as TP, FN, FP, TN. <italic>M</italic><sub><italic>LGG</italic></sub> has the tendency to over-segment (high %FP), while <italic>M</italic><sub><italic>GBM</italic></sub> has the tendency to under-segment tumor (high %FN), relative to each other. Note that metrics such as Dice coefficient routinely ignore the background (TN) in a segmentation context, so a 0.1% difference in false positives should be understood relative to the 6&#x02013;9% of the volume that is tumor.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0002.tif"/>
</fig>
<p>The training of the algorithms further explains these findings. <italic>M</italic><sub><italic>LGG</italic></sub> learns to pick up subtle patterns in the training phase, and when evaluated on <italic>D</italic><sub><italic>GBM</italic></sub>, classifies normal-appearing tissue as part of a tumor. In contrast, <italic>M</italic><sub><italic>GBM</italic></sub> is used to seeing dominant contrast patterns, which explains why it misses a lot of tumor pixels in LGG.</p>
<p>In <xref ref-type="fig" rid="F3">Figure 3</xref>, all models have similarly worse performance on some patients, indicated by red rows. This is visible across all test datasets. This could be due to multiple confounding variables such as different vendors, field strengths, parameters of imaging, strength of the imaging magnet, type of machine, and it is difficult to pinpoint the contributing factor. Metrics show similar trends in all conditions. Models have a high specificity, low sensitivity, and a high AUROC. There is an overall trend of NPV being higher than PPV. These findings reflect the effect of class imbalance in the dataset, and the models&#x00027; ability to recognize benign areas much more easily than tumor regions.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Heat maps indicating patient-level performance metrics. Rows represent test datasets (<italic>D</italic><sub><italic>GBM</italic></sub>, <italic>D</italic><sub><italic>LGG</italic></sub>, <italic>D</italic><sub><italic>ALL</italic></sub>) and columns represent ML algorithms (<italic>M</italic><sub><italic>GBM</italic></sub>, <italic>M</italic><sub><italic>LGG</italic></sub>, <italic>M</italic><sub><italic>ALL</italic></sub>). <italic>D</italic><sub><italic>ALL</italic></sub> is formed by concatenating the first two rows. In each individual heat map, rows represent model performance on a particular test dataset and columns represent segmentation metrics. Patients for whom all models perform similarly worse are indicated in red.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0003.tif"/>
</fig></sec>
<sec>
<title>4.2. Example Illustrations</title>
<p>Here, we present example patients (<xref ref-type="fig" rid="F4">Figures 4</xref>&#x02013;<xref ref-type="fig" rid="F7">7</xref>) with the Ground Truth (GT) tumor and tumor segmentation contours of <italic>M</italic><sub><italic>GBM</italic></sub>, <italic>M</italic><sub><italic>LGG</italic></sub>, and <italic>M</italic><sub><italic>ALL</italic></sub>. We selected good and bad segmentation examples from <italic>D</italic><sub><italic>GBM</italic></sub> and <italic>D</italic><sub><italic>LGG</italic></sub> each for qualitative analysis. One of the authors, who is a board-certified neuroradiologist with more than a decade of experience in brain tumor diagnosis, interpreted these images.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Patient TCGA-06-0168 is diagnosed with GBM in the right temporal operculum. <italic>M</italic><sub><italic>LGG</italic></sub> has lower performance on Dice Coefficient (Dice = 0.6847) than <italic>M</italic><sub><italic>GBM</italic></sub> (Dice = 0.8103) and <italic>M</italic><sub><italic>ALL</italic></sub> (Dice = 0.8616). AUROC for all models is high despite unequal performance. The boundary of the edema on FLAIR sequence shows where <italic>M</italic><sub><italic>LGG</italic></sub> over-segments and <italic>M</italic><sub><italic>GBM</italic></sub> under-segments tumor.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0004.tif"/>
</fig>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Patient TCGA-HT-7874 belongs to <italic>D</italic><sub><italic>LGG</italic></sub> and has a tumor in the right frontal lobe. We selected this patient as it has consistently worse performance for metrics (Sens, B.Acc, Dice, Jac.C) across all models. Segmentation plot indicates <italic>M</italic><sub><italic>All</italic></sub> and <italic>M</italic><sub><italic>GBM</italic></sub> under-segment in this case, whereas <italic>M</italic><sub><italic>LGG</italic></sub> over-segments. <italic>M</italic><sub><italic>ALL</italic></sub> appears to be missing a central part of the tumor, as seen in the coronal and sagittal image planes. <italic>M</italic><sub><italic>LGG</italic></sub> appears to extend well beyond the region of FLAIR enhancement to over-segment the tumor. This LGG was significantly larger than most LGGs, and that may contribute to the difficulty of segmentation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0005.tif"/>
</fig>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Patient TCGA-12-1093 belongs to <italic>D</italic><sub><italic>GBM</italic></sub> and has a tumor in the left parietal lobe. We selected this patient as an example because it has consistently good performance for metrics (Sens, B.Acc, Dice, Jac.C) across all models. This GBM has clear margins, and a sharp boundary on FLAIR enhancing regions. The enhancing tumor core is central and distinct, and the models all perform relatively consistently in segmentation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0006.tif"/>
</fig>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Patient TCGA-DU-6400 belongs to <italic>D</italic><sub><italic>LGG</italic></sub> and has a tumor in the left temporal parietal region. We selected this patient as an example because it has consistently good performance for metrics (Sens, B.Acc, Dice, Jac.C) across all models. This LGG has clear margins, and the classic signature of FLAIR enhancement and no T1-Gd enhancement.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0007.tif"/>
</fig></sec>
<sec>
<title>4.3. <italic>M</italic><sub><italic>LGG</italic></sub> Has the Greatest Confidence for Segmentation Across All Datasets</title>
<p>Violin plots were constructed to analyze average model confidence across all patients. <xref ref-type="fig" rid="F8">Figure 8</xref> depicts the average prediction intervals for the skull-stripped brain region. Models have approximately the same median average prediction intervals (API) on each test dataset. <xref ref-type="fig" rid="F9">Figure 9</xref> represents model confidence while identifying tumor regions. Models have wider inter-quartile range and greater variability compared to <xref ref-type="fig" rid="F8">Figure 8</xref>. This indicates models have low confidence in identifying tumors as compared to non-tumor. <italic>M</italic><sub><italic>GBM</italic></sub> and <italic>M</italic><sub><italic>ALL</italic></sub> have similar distributions of API across patients, indicating both models are similarly confident while segmenting both GBM and LGG tumor. <italic>M</italic><sub><italic>LGG</italic></sub> has the lowest median prediction interval widths, and their distribution has the lowest variability and highest concordance. This indicates <italic>M</italic><sub><italic>LGG</italic></sub> is the most confident model while segmenting both LGG and GBM patients. Out of all models, <italic>M</italic><sub><italic>LGG</italic></sub> is consistently confident while making predictions.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>The violin plot indicates models have equal median confidence while segmenting GBM and LGG patients due to greater number of non-tumor pixels in the datasets. The x-axis represents the datasets. The y-axis represents average prediction intervals. Models are sorted by hues and grouped together by test dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0008.tif"/>
</fig>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Violin plots constructed to correct for effects of class imbalance and analyze model confidence while identifying tumor pixels only. Plots indicate models confidence is less consistent in identifying tumors due to wider inter-quartile range and greater spread of prediction interval distribution. Plot indicates <italic>M</italic><sub><italic>LGG</italic></sub> is the most confident model while segmenting both LGG and GBM patients.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0009.tif"/>
</fig>
<p><italic>M</italic><sub><italic>LGG</italic></sub> has the highest confidence, even though it makes mistakes (over-segments) in segmentation, suggestive of an aggressive approach. <italic>M</italic><sub><italic>GBM</italic></sub> also makes mistakes (under-segments) but has lower confidence, which suggests a cautious approach. LGG may be monitored for a longer period of time, so a high rate of false positives can overburden clinicians, going against the goal of reducing their burden. If mistakes are very obvious, it can cause a high degree of frustration and eventual abandonment of the algorithm (Beede et al., <xref ref-type="bibr" rid="B5">2020</xref>). Previous works have proposed monitoring cases with low confidence (Kompa et al., <xref ref-type="bibr" rid="B27">2021</xref>). However, in a case where a model makes mistakes with high confidence, a confidence-based screening approach might cause the reviewer to miss important areas of model failure.</p></sec>
<sec>
<title>4.4. Models Trained on <italic>D</italic><sub><italic>GBM</italic></sub> Deteriorated the Most Under Adversarial Attacks</title>
<p>The three models (<italic>M</italic><sub><italic>GBM</italic></sub>, <italic>M</italic><sub><italic>LGG</italic></sub>, <italic>M</italic><sub><italic>ALL</italic></sub>) were each evaluated on the three test datasets under FGSM attack across a range of epsilons from 0 to 1. The 95% confidence intervals are also included for each of the metrics that were evaluated on a per patient level. <italic>M</italic><sub><italic>GBM</italic></sub> was the least robust to this type of FGSM attack, across all three test datasets for AUROC, Dice score, and Sensitivity. This might be due to the somewhat consistent imaging presentation of glioblastomas. It was marginally more robust to attack on its own datatype (<italic>D</italic><sub><italic>GBM</italic></sub>). All three models failed by losing sensitivity instead of specificity, indicating that the models began drastically under-segmenting the tumor under high levels of noise. <xref ref-type="fig" rid="F10">Figure 10</xref> highlights the model behavior under different levels of noise. Under smaller amounts of noise (<xref ref-type="fig" rid="F11">Figure 11</xref>), the <italic>M</italic><sub><italic>ALL</italic></sub> model had the best performance generally, though not significantly. <italic>M</italic><sub><italic>LGG</italic></sub> and <italic>M</italic><sub><italic>GBM</italic></sub> had the highest AUROC values of the three models for <italic>D</italic><sub><italic>LGG</italic></sub> and <italic>D</italic><sub><italic>GBM</italic></sub>, respectively, though the differences did not reach the significance threshold of (<italic>p</italic> &#x0003C; 0.05).</p>
<fig id="F10" position="float">
<label>Figure 10</label>
<caption><p>Robustness of each model under FGSM attack, across the full range of epsilons (0&#x02013;1.0) for four selected metrics. Ninety-five percent confidence intervals are provided to each model, and each of the three data sets were evaluated. <italic>M</italic><sub><italic>GBM</italic></sub> was least robust to FGSM attack at higher epsilon values with regard to AUROC, Dice score, and sensitivity.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0010.tif"/>
</fig>
<fig id="F11" position="float">
<label>Figure 11</label>
<caption><p>Robustness of each model under FGSM attack, zoomed in on the early range of epsilons (0&#x02013;0.2) for four selected metrics. Ninety-five percent confidence intervals are provided to each model, and each of the three data sets were evaluated. Models had more similar performance in the less aggressive levels of attack, with the <italic>M</italic><sub><italic>ALL</italic></sub> model having marginally better performance, except with <italic>M</italic><sub><italic>LGG</italic></sub> and <italic>M</italic><sub><italic>GBM</italic></sub> models performing better with AUROC on their own test data sets.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0011.tif"/>
</fig>
<p>We found that the models trained only on <italic>D</italic><sub><italic>GBM</italic></sub> were less robust to adversarial noise, particularly at high levels of adversarial noise. These levels of noise may be extreme, but do give some sense of the performance of the models under duress. Other types of attacks that might be worthwhile to investigate include: adversarial patch attacks, Carlini and Wagner attacks, projected gradient descent, as well as GAN based attacks (Carlini and Wagner, <xref ref-type="bibr" rid="B9">2017</xref>; Brown et al., <xref ref-type="bibr" rid="B7">2018</xref>; Ren et al., <xref ref-type="bibr" rid="B43">2020</xref>). This is not the only way of assessing robustness of models, as it assumes a motivated attacker to guide attacks, as opposed to natural sources of error, but it addresses how the margins of the tumor are affected on a consistent scale across the models. Natural sources of error are less coherent, comparable, and not as well computationally modeled in MRI as the body of work on adversarial attacks.</p></sec>
<sec>
<title>4.5. MRQy Features Vary Between Data Sets and Institutions, but Are Not Significantly Correlated With Metrics</title>
<p>The calibrated models&#x00027; metrics and probabilities were evaluated for correlations with MRQy parameters, across the different test datasets. While there were some limited parameters that had significant correlations with model metrics, this was before FDR correction. One thousand two hundred and twenty-four parameter-to-metric comparisons (17 MRQy parameters &#x000D7; 4 sequences &#x000D7; 6 metrics &#x000D7; 3 models) were performed, and none of the parameter-metric pairs were significantly correlated after FDR correction (<italic>p</italic> &#x0003C; 0.05). The MRQy features were collected before preprocessing, and were shown to be different across different institutions. However, the model used preprocessed data, and the MRQy features were not significantly correlated with the models&#x00027; predictions and performance. This negative result adds more confidence to the predictions of the machine learning pipeline.</p>
<p>The PCA analysis showed that there were significant differences between three groups of site codes. The first cluster of institutions was 12, 06, and 08, the second was HT, DU, CS and FG, and the last was 02. Paired <italic>t</italic>-tests showed that the first principal component created splits with significant differences (<italic>p</italic> &#x0003C; 0.05). Notably, the numerical codes (02, 06, 08, 12) correspond to GBM studies, and alpha codes corresponded to LGG studies (HT, DU, CS, FG). However, within these clusters, the differences didn&#x00027;t reach significance. <xref ref-type="fig" rid="F12">Figure 12</xref> shows the site codes plotted in PCA space, and then the three models with Dice coefficient. The fact that Henry Ford Hospital (06 for GBM and DU for LGG) had more in common with other GBM and LGG sites than between those two groups is notable, though hard to explain with such a limited sample size. Site 02 was also an outlier from both other clusters in this PCA space, and had relatively poor performance, though with one case it&#x00027;s hard to draw a firm conclusion.</p>
<fig id="F12" position="float">
<label>Figure 12</label>
<caption><p>MRQy features after principal component analysis, plotted against site code, and Dice scores of three models. The <italic>D</italic><sub><italic>ALL</italic></sub> test dataset has the first two principal components of the MRQy features plotted. Pairwise <italic>t</italic>-tests found that three clusters had significant differences in PCA space (06, 08, 12), (HT, DU, CS, and FG), and (02). Notably, these numeric codes happen to correspond to GBM studies, and the letter codes happen to correspond to LGG studies.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-15-740353-g0012.tif"/>
</fig>
<p>The BraTS 2018 test datasets (Menze et al., <xref ref-type="bibr" rid="B35">2015</xref>) did not have significant correlations after FDR correction between scan quality and metrics. This could be due to the high-fidelity curation and good consistency of the dataset. Another potential explanation could be the limited size of the dataset. Still, these data quality metrics show significant correlations with TCGA sites after PCA analysis, indicating batch effect differences, at least between the GBM and LGG datasets. Other data quality issues that models should be tested for include bias based on race, sex, and socioeconomic status. The rise of federated learning models makes this more urgent, because they allow for training models across collaborators without sharing data (Kairouz et al., <xref ref-type="bibr" rid="B24">2021</xref>). Since sensitive data is not shared between sites, tracking batch effects and sources of bias requires more work and planning than if all the data were shared and managed centrally.</p></sec></sec>
<sec sec-type="discussion" id="s5">
<title>5. Discussion</title>
<p>In this work, we used publicly available data and compared three U-Net-based algorithms in a stratified manner. Our main finding is that traditional segmentation performance metrics do not capture all aspects of an algorithm&#x00027;s performance, and can be potentially misleading. In this section, we first discuss the limitations of segmentation metrics, and how our proposed evaluation framework leads to a better understanding of model performance. We discuss the four axes of evaluation&#x02014;diagnostic performance, model confidence, robustness, and analysis of batch effects in detail. Finally, we address the practical utility of our framework and list recommendations for model evaluation.</p>
<sec>
<title>5.1. Limitations of Segmentation Metrics</title>
<p>Despite the technological advancements of Machine Learning (ML), the adoption of ML in clinical workflows remains limited (Caruana et al., <xref ref-type="bibr" rid="B10">2015</xref>; Strickland, <xref ref-type="bibr" rid="B52">2019</xref>; Beede et al., <xref ref-type="bibr" rid="B5">2020</xref>). This divide between the development and adoption of ML algorithms has been termed the &#x0201C;translation gap&#x0201D; (Steiner et al., <xref ref-type="bibr" rid="B51">2021</xref>). This limitation is in part due to lack of holistic evaluation of the performance of those ML systems.</p>
<p>The majority of existing algorithms are statistically validated only using segmentation metrics (van Kempen et al., <xref ref-type="bibr" rid="B58">2021</xref>), such as Dice Coefficient (Dice, <xref ref-type="bibr" rid="B15">1945</xref>). In our experiments, we followed guidelines (Taha and Hanbury, <xref ref-type="bibr" rid="B53">2015</xref>) to compute several segmentation metrics and test the differences between segmentation of GBM and LGG patients. We hypothesized that segmentation of LGG patients would be more difficult than GBM patients. LGG is diffuse and has low proliferation, which makes accurate segmentation of submicroscopic tumor tissues and tendrils a difficult task. In contrast, GBM has greater signal intensity and characteristic presence of necrotic cavities, which makes segmentation comparatively more obvious. Our results found that metrics alone were insufficient to highlight the severity of mistakes that models make in segmentation. Only when segmentation contours were interpreted by a board-certified neuroradiologist did the degree and types of errors of these models become evident. Similarly, in a recent systematic review of glioma segmentation algorithms, van Kempen et al. (<xref ref-type="bibr" rid="B58">2021</xref>) expected to find performance differences in segmentation of HGGs and LGGs but found that reported metrics could not capture such differences.</p>
<p>This points to a bigger concern raised by Reinke et al. (<xref ref-type="bibr" rid="B42">2021</xref>) that metrics alone are insufficient to evaluate all aspects of segmentation performance. While metrics are important for objective performance evaluation, they have several limitations for clinical utility (Maier-Hein et al., <xref ref-type="bibr" rid="B32">2018</xref>). Difference in consequences of an algorithm&#x00027;s errors cannot be uncovered by metrics alone, and requires a clinical expert to elucidate them. For example, the consequences of under-segmenting in <italic>D</italic><sub><italic>GBM</italic></sub> might be more severe than under-segmenting in <italic>D</italic><sub><italic>LGG</italic></sub> due to the prognosis and management of the two diseases. As LGGs may merit a more conservative, &#x0201C;wait-and-watch&#x0201D; approach, tumor that might be previously missed can be caught with additional tests. However, segmentation in case of GBM has more immediate consequences for resection and radiotherapy. Under-segmentation in this case would result in non-total resection, and any remaining tumor tissue would increase the likelihood of recurrence. Over-segmentation on the other hand would cause removal of non-tumor regions of the brain, or subject them to higher levels of radiotherapy, potentially causing functional impairments for patients. In case of glioma, the Dice Coefficient has a limited utility for evaluation of multifocal lesions (Giannopoulos and Kyritsis, <xref ref-type="bibr" rid="B19">2010</xref>) because it cannot represent over-segmentation and under-segmentation (Yeghiazaryan and Voiculescu, <xref ref-type="bibr" rid="B64">2018</xref>), does not support segmentation of multiple structures (Yeghiazaryan and Voiculescu, <xref ref-type="bibr" rid="B64">2018</xref>), and is not immune to imaging artifacts and shape differences (Reinke et al., <xref ref-type="bibr" rid="B42">2021</xref>). 
This serves as a cautionary tale that metrics alone are insufficient for reporting model performance, and there is clearly a need for better evaluation and reporting standards (Nagendran et al., <xref ref-type="bibr" rid="B36">2020</xref>).</p>
<p>Since medical data is tightly controlled to protect patient privacy, federated learning has risen as a methodology to train models without exposing data. However, while the cross-site training structure has its advantages, it requires thoughtful planning of model evaluation since model designers will not have access to the underlying data from other sites. Any metrics, quality control features, and batch effect monitoring will have to be carefully pre-planned to judge any resulting models. Thorough and holistic evaluation is especially important as site variability in protocol and patient populations is a known confounding factor. Our framework also helps illuminate the axes on which a federated learning network should judge their models beyond simple metrics like accuracy or AUROC.</p></sec>
<sec>
<title>5.2. Dimensions of the Evaluation Framework</title>
<p>The goal of our work is to inform how researchers can holistically evaluate their segmentation algorithms, and consider other axes of model performance than metrics alone. A problem faced by model developers in this domain is the lack of large datasets to effectively train and evaluate their algorithms. To realistically recreate this, we worked with smaller test datasets from TCGA-GBM and TCGA-LGG. Our work explores the effects of working with limited data, and informs how to interpret results meaningfully in such scenarios. Our experiments and methodology stand independently of whether the model evaluator has pre-built models, or is yet to train them. Our framework considers tumor heterogeneity, limitations of metrics and evaluates other axes such as model confidence, robustness, and batch effects. We don&#x00027;t suggest completely abandoning metrics&#x02014;they would be important as a start, to get some level of insight. However, we caution against solely relying on metrics, and propose a more holistic evaluation of algorithms. In <xref ref-type="fig" rid="F1">Figure 1</xref>, we map the axes of evaluation onto the standard ML pipeline. We provide other potential experiments that researchers can choose for model evaluation along specific axes. For example, techniques such as model ensembles and k-fold cross validation can be used to evaluate model confidence.</p>
<p>In our experiments, we evaluate model robustness with adversarial attacks. Recent work has shown the importance of evaluating the models&#x00027; abilities to withstand adversarial attacks, especially in high-stakes scenarios such as radiology (Wetstein et al., <xref ref-type="bibr" rid="B60">2020</xref>). These attacks can arise due to strong financial interests or technical infrastructure. We designed this experiment to test how and in what way models could fail in deployment under such an attack. This could lead to appropriate safeguards being put in place. Adversarial attacks also help shed light on the decision boundary of a neural network (Woods et al., <xref ref-type="bibr" rid="B63">2019</xref>), which is otherwise something of a black box. Other sources of noise could be added, but have their own complications. Adding Gaussian noise to the inputs can be difficult to calibrate and variable due to randomness. Addition of artifacts, such as motion artifacts, is complex to model, and tools for doing so are not publicly available. Further research should investigate models using these failure modes, but is outside the scope of this paper. Another axis we investigate is analyzing the dataset for batch effects. In the context of tumor segmentation, batch effects could occur when image acquisition parameters or technical variations correlate with measurement quantity (Sadri et al., <xref ref-type="bibr" rid="B48">2020</xref>). This may become a major problem when it leads to incorrect conclusions (Leek et al., <xref ref-type="bibr" rid="B30">2010</xref>), especially when ML algorithms learn to pick up on these patterns. Analyzing for batch effects thus becomes important, as model predictions can be correlated with confounding factors. Our experiments found that pre-processing might help in making MRI scans more homogeneous and reduce these correlations.</p>
<p>We demonstrated our evaluation framework on ML algorithms trained with reliable, high-fidelity, expert-annotated BraTS Datasets. To further simplify the process of model development, we used straightforward implementations such as fixed dataset split (testing/validation) and 2D segmentation to work with limited data. Model developers can certainly use more sophisticated techniques that result in higher accuracy. Despite these limitations, our experiments are aligned to the overall goal of this work. Another limitation is that we consider LGG for evaluation of generalizability. While there are significant imaging differences as compared to GBM, LGG is a broad category consisting of a range of tumor types. A more clinically useful investigation would be to evaluate performance on WHO recognized genetic subtypes such as IDH-mutant vs IDH-wt or 1p/19q codeleted tumors, as the literature on tumor subtypes evolves (Louis et al., <xref ref-type="bibr" rid="B31">2016</xref>). However, we defer this as future work.</p></sec>
<sec>
<title>5.3. Recommendations for Evaluation of Tumor Segmentation Algorithms</title>
<p>Here, we summarize our work and present the following recommendations for holistic evaluation of ML algorithms:</p>
<p><bold>Accounting for tumor heterogeneity in evaluation:</bold> We focus on a specific problem of glioma, and evaluate for differences in models trained by stratification of GBM and LGG Data. The first stage in standard of care for glioma is the identification of the type, which further dictates the prognosis and treatment planning. However, there is high variability in this stage, and experts often don&#x00027;t reach immediate consensus. It is thus important for ML algorithms to generalize well across all tumor grades. We set out to investigate this question, by performing holistic evaluation on LGG, GBM, and mixed data. Researchers should consider unique imaging presentations of each patient and evaluate on a patient-level, as important differences might be diminished upon aggregation of data. Researchers should avoid evaluation on a dataset-level.</p>
<p><bold>Adoption of tools in other domains to investigate glioma segmentation:</bold> Domains such as adversarial robustness and statistics have highly specialized tools (e.g., FGSM, conformal prediction intervals) to interrogate different aspects of model performance. In this work, we demonstrate the value of adopting such tools for the problem of performance evaluation of glioma segmentation. Our results indicate clear differences in these experiments. We found the model trained on LGG Data to be more confident, and the model trained on GBM to suffer the most under adversarial attacks. Researchers should evaluate their algorithms on each of the evaluation axes, by performing at least one experiment on each of the axes (<xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<p><bold>Exploring limitations of metrics in clinical utility:</bold> In recent years, the community has started to acknowledge the clinical limitations of standard segmentation metrics. Our work demonstrates why evaluation by metrics alone is limiting in investigating heterogeneity in clinical populations (i.e., GBM vs. LGG patients), and our findings further support recent literature. Researchers should avoid relying solely on metrics to evaluate their models.</p>
<p>The framework can further shed light on the practical utility of an algorithm, and serve as a decision-support tool. It is not meant to replace the triaging mechanisms already in place. Since the action that accompanies a decision is different, researchers should know the situations and the patient case before use of these algorithms. If the algorithm&#x00027;s prediction would be followed by a high-stakes action component such as surgery, tumor resection, or radiation therapy, accuracy of segmentation is critical. Our results indicate that algorithms trained on a specific glioma grade group do not generalize well out of distribution, so it is best to use specifically-trained models. For example, if a patient with GBM is to undergo surgery, use of <italic>M</italic><sub><italic>GBM</italic></sub> as a decision-support tool would be best. In low-stakes scenarios such as assessing the extent of tumor infiltration, generalizability is more important, even at the cost of accuracy. The use of <italic>M</italic><sub><italic>ALL</italic></sub>, which has knowledge of all glioma grade groups, would be best in this scenario.</p>
<p>Establishing a close collaboration with a clinical expert is crucial to ensure that results of the framework are appropriately interpreted. In this work, the authors collaborated with experts in neuroradiology and radiation oncology to deep-dive into the problem of brain tumor segmentation and present the limitations of metrics in a clinically meaningful way. Researchers should similarly consult a clinical expert to understand how tumor heterogeneity manifests in imaging presentations between the subgroups of the tumor they are interested to investigate. The use of this framework in other domains would thus require a close collaboration between ML researchers and clinicians for effective investigation.</p></sec></sec>
<sec sec-type="conclusions" id="s6">
<title>6. Conclusion</title>
<p>In this work, we proposed a framework to evaluate the performance of tumor segmentation algorithms. To illustrate the framework, we investigated the generalizability of algorithms in different glioma grade groups. Institutions such as the American College of Radiology, Data Science Institute (ACR DSI) often lay out guidelines to researchers for best practices before model deployment. However, it is often not clear to researchers how to evaluate models. We take a more granular view and present a tutorial of sorts, in addition to proposing a holistic framework for better model evaluation. In addition, we provide the following recommendations to researchers: (1) Perform at least one experiment on model confidence, diagnostic performance, data quality and robustness. (2) Perform analysis on a per-patient basis. (3) Gather representative images informed by the results of such analysis. (4) Collaborate with a clinical expert to perform qualitative evaluation of these images to get deeper insight on model performance.</p></sec>
<sec sec-type="data-availability" id="s7">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://www.med.upenn.edu/sbia/brats2018/data.html">https://www.med.upenn.edu/sbia/brats2018/data.html</ext-link>.</p></sec>
<sec id="s8">
<title>Author Contributions</title>
<p>NCW: data acquisition and pre-processing. VA, NCW, and SP: design of the experiments. SP and NCW: performing experiments and data analysis. AR, NCW, and SP: results interpretation. SP, NCW, NB, and XH: writing of the manuscript. AR: conception and design of study project and supervision. NB and XH: co-advising. JRB: clinical interpretation and guidance. All authors contributed to the article and approved the submitted version.</p></sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>SP was supported by UM-MICDE Catalyst Grant (to XH and AR). AR was supported by CCSG Bioinformatics Shared Resource 5 P30 CA046592, a Research Scholar Grant from the American Cancer Society (RSG-16-005-01), and a Precision health Investigator award from U-M Precision Health to AR (along with L. Rozek and M. Sartor). SP and AR were also partially supported by the NCI Grant R37-CA214955. AR, NCW, XH, and NB were also supported by the University of Michigan (U-M) startup institutional research funds.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>NCW was a founder and shareholder of Prenovo, EIQ, and AMI healthcare technology startups. AR has a consulting agreement with Voxel analytics LLC. and consults for Genophyll, LLC. The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
</body>
<back>
<ack><p>Experiments were performed on Armis2 HPC Clusters provided by UM&#x00027;s Precision Health Initiative. Some advice on clinical perspective on gliomas was provided by Dr. Ashok Srinivasan. We acknowledge the support from the Center for Ethics, Society, and Computing (ESC) at the University of Michigan, Ann Arbor.</p>
</ack><sec sec-type="supplementary-material" id="s11">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fnins.2021.740353/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fnins.2021.740353/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.PDF" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bajaj</surname> <given-names>A. S.</given-names></name> <name><surname>Chouhan</surname> <given-names>U.</given-names></name></person-group> (<year>2020</year>). <article-title>A review of various machine learning techniques for brain tumor detection from MRI images</article-title>. <source>Curr. Med. Imag</source>. <volume>16</volume>, <fpage>937</fpage>&#x02013;<lpage>945</lpage>. <pub-id pub-id-type="doi">10.2174/1573405615666190903144419</pub-id><pub-id pub-id-type="pmid">33081656</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bakas</surname> <given-names>S.</given-names></name> <name><surname>Akbari</surname> <given-names>H.</given-names></name> <name><surname>Sotiras</surname> <given-names>A.</given-names></name> <name><surname>Bilello</surname> <given-names>M.</given-names></name> <name><surname>Rozycki</surname> <given-names>M.</given-names></name> <name><surname>Kirby</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2017a</year>). <article-title>Segmentation labels for the pre-operative scans of the TCGA-GBM collection</article-title>. [Data Set]. <source>The Cancer Imaging Archive</source>. <pub-id pub-id-type="doi">10.7937/K9/TCIA.2017.KLXWJJ1Q</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bakas</surname> <given-names>S.</given-names></name> <name><surname>Akbari</surname> <given-names>H.</given-names></name> <name><surname>Sotiras</surname> <given-names>A.</given-names></name> <name><surname>Bilello</surname> <given-names>M.</given-names></name> <name><surname>Rozycki</surname> <given-names>M.</given-names></name> <name><surname>Kirby</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2017b</year>). <article-title>Segmentation labels for the pre-operative scans of the TCGA-LGG collection</article-title>. [Data Set]. <source>The Cancer Imaging Archive</source>. <pub-id pub-id-type="doi">10.7937/K9/TCIA.2017.GJQ7R0EF</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bakas</surname> <given-names>S.</given-names></name> <name><surname>Akbari</surname> <given-names>H.</given-names></name> <name><surname>Sotiras</surname> <given-names>A.</given-names></name> <name><surname>Bilello</surname> <given-names>M.</given-names></name> <name><surname>Rozycki</surname> <given-names>M.</given-names></name> <name><surname>Kirby</surname> <given-names>J. S.</given-names></name> <etal/></person-group>. (<year>2017c</year>). <article-title>Advancing the Cancer Genome Atlas glioma MRI collections with expert segmentation labels and radiomic features</article-title>. <source>Sci. Data</source> <volume>4</volume>:<fpage>170117</fpage>. <pub-id pub-id-type="doi">10.1038/sdata.2017.117</pub-id><pub-id pub-id-type="pmid">28872634</pub-id></citation></ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Beede</surname> <given-names>E.</given-names></name> <name><surname>Baylor</surname> <given-names>E.</given-names></name> <name><surname>Hersch</surname> <given-names>F.</given-names></name> <name><surname>Iurchenko</surname> <given-names>A.</given-names></name> <name><surname>Wilcox</surname> <given-names>L.</given-names></name> <name><surname>Ruamviboonsuk</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>A human-centered evaluation of a deep learning system deployed in clinics for the detection of diabetic retinopathy,</article-title> in <source>Proceedings of the 2020 CHI Conference on Human Factors in Computing Systems</source>, (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1145/3313831.3376718</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Benjamini</surname> <given-names>Y.</given-names></name> <name><surname>Hochberg</surname> <given-names>Y.</given-names></name></person-group> (<year>1995</year>). <article-title>Controlling the false discovery rate: a practical and powerful approach to multiple testing</article-title>. <source>J. R. Stat. Soc. Ser. B</source> <volume>57</volume>, <fpage>289</fpage>&#x02013;<lpage>300</lpage>. <pub-id pub-id-type="doi">10.1111/j.2517-6161.1995.tb02031.x</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Brown</surname> <given-names>T. B.</given-names></name> <name><surname>Man&#x000E9;</surname> <given-names>D.</given-names></name> <name><surname>Roy</surname> <given-names>A.</given-names></name> <name><surname>Abadi</surname> <given-names>M.</given-names></name> <name><surname>Gilmer</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <source>Adversarial patch. arXiv [preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1712.09665">http://arxiv.org/abs/1712.09665</ext-link> (accessed June 6, 2021).</citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bulakba&#x0015F;&#x00131;</surname> <given-names>N.</given-names></name> <name><surname>Paksoy</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>Advanced imaging in adult diffusely infiltrating low-grade gliomas</article-title>. <source>Insights Imaging</source> <volume>10</volume>:<fpage>122</fpage>. <pub-id pub-id-type="doi">10.1186/s13244-019-0793-8</pub-id><pub-id pub-id-type="pmid">32323033</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Carlini</surname> <given-names>N.</given-names></name> <name><surname>Wagner</surname> <given-names>D.</given-names></name></person-group> (<year>2017</year>). <article-title>Towards evaluating the robustness of neural networks,</article-title> in <source>2017 IEEE Symposium on Security and Privacy (SP)</source> (<publisher-loc>San Jose, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>), <fpage>39</fpage>&#x02013;<lpage>57</lpage>. <pub-id pub-id-type="doi">10.1109/SP.2017.49</pub-id></citation></ref>
<ref id="B10">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Caruana</surname> <given-names>R.</given-names></name> <name><surname>Lou</surname> <given-names>Y.</given-names></name> <name><surname>Gehrke</surname> <given-names>J.</given-names></name> <name><surname>Koch</surname> <given-names>P.</given-names></name> <name><surname>Sturm</surname> <given-names>M.</given-names></name> <name><surname>Elhadad</surname> <given-names>N.</given-names></name></person-group> (<year>2015</year>). <article-title>Intelligible models for healthcare: predicting pneumonia risk and hospital 30-day readmission,</article-title> in <source>Association for Computing Machinery</source> (<publisher-loc>New York, NY</publisher-loc>), <fpage>1721</fpage>&#x02013;<lpage>1730</lpage>. <pub-id pub-id-type="doi">10.1145/2783258.2788613</pub-id></citation></ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chan</surname> <given-names>P.</given-names></name> <name><surname>Dinniwell</surname> <given-names>R.</given-names></name> <name><surname>Haider</surname> <given-names>M. A.</given-names></name> <name><surname>Cho</surname> <given-names>Y. B.</given-names></name> <name><surname>Jaffray</surname> <given-names>D.</given-names></name> <name><surname>Lockwood</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2008</year>). <article-title>Inter- and intrafractional tumor and organ movement in patients with cervical cancer undergoing radiotherapy: a cinematic-MRI point-of-interest study</article-title>. <source>Int. J. Radiat. Oncol. Biol. Phys</source>. <volume>70</volume>, <fpage>1507</fpage>&#x02013;<lpage>1515</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijrobp.2007.08.055</pub-id><pub-id pub-id-type="pmid">18164850</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>R.</given-names></name> <name><surname>Smith-Cohn</surname> <given-names>M.</given-names></name> <name><surname>Cohen</surname> <given-names>A. L.</given-names></name> <name><surname>Colman</surname> <given-names>H.</given-names></name></person-group> (<year>2017</year>). <article-title>Glioma subclassifications and their clinical significance</article-title>. <source>Neurotherapeutics</source> <volume>14</volume>, <fpage>284</fpage>&#x02013;<lpage>297</lpage>. <pub-id pub-id-type="doi">10.1007/s13311-017-0519-x</pub-id><pub-id pub-id-type="pmid">28281173</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Clark</surname> <given-names>K.</given-names></name> <name><surname>Vendt</surname> <given-names>B.</given-names></name> <name><surname>Smith</surname> <given-names>K.</given-names></name> <name><surname>Freymann</surname> <given-names>J.</given-names></name> <name><surname>Kirby</surname> <given-names>J.</given-names></name> <name><surname>Koppel</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>The cancer imaging archive (TCIA): maintaining and operating a public information repository</article-title>. <source>J. Digit. Imaging</source> <volume>26</volume>, <fpage>1045</fpage>&#x02013;<lpage>1057</lpage>. <pub-id pub-id-type="doi">10.1007/s10278-013-9622-7</pub-id><pub-id pub-id-type="pmid">23884657</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Claus</surname> <given-names>E. B.</given-names></name> <name><surname>Walsh</surname> <given-names>K. M.</given-names></name> <name><surname>Wiencke</surname> <given-names>J.</given-names></name> <name><surname>Molinaro</surname> <given-names>A. M.</given-names></name> <name><surname>Wiemels</surname> <given-names>J. L.</given-names></name> <name><surname>Schildkraut</surname> <given-names>J. M.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Survival and low grade glioma: the emergence of genetic information</article-title>. <source>Neurosurg. Focus</source> <volume>38</volume>:<fpage>E6</fpage>. <pub-id pub-id-type="doi">10.3171/2014.10.FOCUS12367</pub-id><pub-id pub-id-type="pmid">25552286</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dice</surname> <given-names>L. R.</given-names></name></person-group> (<year>1945</year>). <article-title>Measures of the amount of ecologic association between species</article-title>. <source>Ecology</source> <volume>26</volume>, <fpage>297</fpage>&#x02013;<lpage>302</lpage>. <pub-id pub-id-type="doi">10.2307/1932409</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Dong</surname> <given-names>H.</given-names></name> <name><surname>Supratak</surname> <given-names>A.</given-names></name> <name><surname>Mai</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>F.</given-names></name> <name><surname>Oehmichen</surname> <given-names>A.</given-names></name> <name><surname>Yu</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>TensorLayer: a versatile library for efficient deep learning development,</article-title> in <source>Proceedings of the 25th ACM International Conference on Multimedia</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>1201</fpage>&#x02013;<lpage>1204</lpage>. <pub-id pub-id-type="doi">10.1145/3123266.3129391</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Elder</surname> <given-names>B.</given-names></name> <name><surname>Arnold</surname> <given-names>M.</given-names></name> <name><surname>Murthi</surname> <given-names>A.</given-names></name> <name><surname>Navr&#x000E1;til</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <source>Learning prediction intervals for model performance. <italic>arXiv [Preprint]</italic></source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2012.08625">http://arxiv.org/abs/2012.08625</ext-link></citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Forst</surname> <given-names>D. A.</given-names></name> <name><surname>Nahed</surname> <given-names>B. V.</given-names></name> <name><surname>Loeffler</surname> <given-names>J. S.</given-names></name> <name><surname>Batchelor</surname> <given-names>T. T.</given-names></name></person-group> (<year>2014</year>). <article-title>Low-grade gliomas</article-title>. <source>Oncologist</source> <volume>19</volume>, <fpage>403</fpage>&#x02013;<lpage>413</lpage>. <pub-id pub-id-type="doi">10.1634/theoncologist.2013-0345</pub-id><pub-id pub-id-type="pmid">24664484</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Giannopoulos</surname> <given-names>S.</given-names></name> <name><surname>Kyritsis</surname> <given-names>A.</given-names></name></person-group> (<year>2010</year>). <article-title>Diagnosis and management of multifocal gliomas</article-title>. <source>Oncology</source> <volume>79</volume>, <fpage>306</fpage>&#x02013;<lpage>312</lpage>. <pub-id pub-id-type="doi">10.1159/000323492</pub-id><pub-id pub-id-type="pmid">21412017</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Goodfellow</surname> <given-names>I. J.</given-names></name> <name><surname>Shlens</surname> <given-names>J.</given-names></name> <name><surname>Szegedy</surname> <given-names>C.</given-names></name></person-group> (<year>2015</year>). <article-title>Explaining and harnessing adversarial examples,</article-title> in <source>3rd International Conference on Learning Representations, ICLR 2015 - Conference Track Proceedings. International Conference on Learning Representations</source> (<publisher-loc>San Diego, CA</publisher-loc>).</citation></ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>C.</given-names></name> <name><surname>Pleiss</surname> <given-names>G.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Weinberger</surname> <given-names>K. Q.</given-names></name></person-group> (<year>2017</year>). <article-title>On calibration of modern neural networks,</article-title> in <source>International Conference on Machine Learning</source> (<publisher-loc>Sydney, NSW</publisher-loc>: <publisher-name>PMLR</publisher-name>) <volume>70</volume>, <fpage>1321</fpage>&#x02013;<lpage>1330</lpage>.</citation></ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Jungo</surname> <given-names>A.</given-names></name> <name><surname>Reyes</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>Assessing reliability and challenges of uncertainty estimations for medical image segmentation,</article-title> in <source>Lecture Notes in Computer Science (Including Subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)</source> (<publisher-loc>Shenzhen</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>48</fpage>&#x02013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-32245-8_6</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kabir</surname> <given-names>H. M.</given-names></name> <name><surname>Khosravi</surname> <given-names>A.</given-names></name> <name><surname>Hosen</surname> <given-names>M. A.</given-names></name> <name><surname>Nahavandi</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Neural network-based uncertainty quantification: a survey of methodologies and applications</article-title>. <source>IEEE Access</source> <volume>6</volume>, <fpage>36218</fpage>&#x02013;<lpage>36234</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2018.2836917</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kairouz</surname> <given-names>P.</given-names></name> <name><surname>McMahan</surname> <given-names>H. B.</given-names></name> <name><surname>Avent</surname> <given-names>B.</given-names></name> <name><surname>Bellet</surname> <given-names>A.</given-names></name> <name><surname>Bennis</surname> <given-names>M.</given-names></name> <name><surname>Bhagoji</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Advances and Open Problems in Federated Learning</article-title>. <source>Found. Trends Mach. Learn.</source> <volume>14</volume>, <fpage>1</fpage>&#x02013;<lpage>210</lpage>. <pub-id pub-id-type="doi">10.1561/9781680837896</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name></person-group> (<year>2015</year>). <source>Adam: a method for stochastic optimization. arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1412.6980">http://arxiv.org/abs/1412.6980</ext-link>.</citation></ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kocher</surname> <given-names>M.</given-names></name> <name><surname>Ruge</surname> <given-names>M. I.</given-names></name> <name><surname>Galldiks</surname> <given-names>N.</given-names></name> <name><surname>Lohmann</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>Applications of radiomics and machine learning for radiotherapy of malignant brain tumors</article-title>. <source>Strahlenther. Onkol</source>. <volume>196</volume>, <fpage>856</fpage>&#x02013;<lpage>867</lpage>. <pub-id pub-id-type="doi">10.1007/s00066-020-01626-8</pub-id><pub-id pub-id-type="pmid">32394100</pub-id></citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kompa</surname> <given-names>B.</given-names></name> <name><surname>Snoek</surname> <given-names>J.</given-names></name> <name><surname>Beam</surname> <given-names>A. L.</given-names></name></person-group> (<year>2021</year>). <article-title>Second opinion needed: communicating uncertainty in medical machine learning</article-title>. <source>NPJ Digit. Med</source>. <volume>4</volume>, <fpage>1</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1038/s41746-020-00367-3</pub-id><pub-id pub-id-type="pmid">33402680</pub-id></citation></ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>K&#x000FC;mmel</surname> <given-names>A.</given-names></name> <name><surname>Bonate</surname> <given-names>P. L.</given-names></name> <name><surname>Dingemanse</surname> <given-names>J.</given-names></name> <name><surname>Krause</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Confidence and prediction intervals for pharmacometric models</article-title>. <source>Pharmacometr. Syst. Pharmacol</source>. <volume>7</volume>, <fpage>360</fpage>&#x02013;<lpage>373</lpage>. <pub-id pub-id-type="doi">10.1002/psp4.12286</pub-id><pub-id pub-id-type="pmid">29388347</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Larsen</surname> <given-names>J.</given-names></name> <name><surname>Wharton</surname> <given-names>S. B.</given-names></name> <name><surname>McKevitt</surname> <given-names>F.</given-names></name> <name><surname>Romanowski</surname> <given-names>C.</given-names></name> <name><surname>Bridgewater</surname> <given-names>C.</given-names></name> <name><surname>Zaki</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x00027;Low grade glioma&#x00027;: an update for radiologists</article-title>. <source>Br. J. Radiol</source>. <volume>90</volume>:<fpage>1070</fpage>. <pub-id pub-id-type="doi">10.1259/bjr.20160600</pub-id><pub-id pub-id-type="pmid">27925467</pub-id></citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Leek</surname> <given-names>J. T.</given-names></name> <name><surname>Scharpf</surname> <given-names>R. B.</given-names></name> <name><surname>Bravo</surname> <given-names>H. C.</given-names></name> <name><surname>Simcha</surname> <given-names>D.</given-names></name> <name><surname>Langmead</surname> <given-names>B.</given-names></name> <name><surname>Johnson</surname> <given-names>W. E.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>Tackling the widespread and critical impact of batch effects in high-throughput data</article-title>. <source>Nat. Rev. Genet</source>. <volume>11</volume>, <fpage>733</fpage>&#x02013;<lpage>739</lpage>. <pub-id pub-id-type="doi">10.1038/nrg2825</pub-id><pub-id pub-id-type="pmid">20838408</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Louis</surname> <given-names>D. N.</given-names></name> <name><surname>Perry</surname> <given-names>A.</given-names></name> <name><surname>Reifenberger</surname> <given-names>G.</given-names></name> <name><surname>von Deimling</surname> <given-names>A.</given-names></name> <name><surname>Figarella-Branger</surname> <given-names>D.</given-names></name> <name><surname>Cavenee</surname> <given-names>W. K.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>The 2016 World Health Organization classification of tumors of the central nervous system: a summary</article-title>. <source>Acta Neuropathol</source>. <volume>131</volume>, <fpage>803</fpage>&#x02013;<lpage>820</lpage>. <pub-id pub-id-type="doi">10.1007/s00401-016-1545-1</pub-id><pub-id pub-id-type="pmid">27157931</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Maier-Hein</surname> <given-names>L.</given-names></name> <name><surname>Eisenmann</surname> <given-names>M.</given-names></name> <name><surname>Reinke</surname> <given-names>A.</given-names></name> <name><surname>Onogur</surname> <given-names>S.</given-names></name> <name><surname>Stankovic</surname> <given-names>M.</given-names></name> <name><surname>Scholz</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Why rankings of biomedical image analysis competitions should be interpreted with care</article-title>. <source>Nat. Commun</source>. <volume>9</volume>, <fpage>1</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1038/s41467-018-07619-7</pub-id><pub-id pub-id-type="pmid">30700735</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>McKinney</surname> <given-names>S. M.</given-names></name> <name><surname>Sieniek</surname> <given-names>M.</given-names></name> <name><surname>Godbole</surname> <given-names>V.</given-names></name> <name><surname>Godwin</surname> <given-names>J.</given-names></name> <name><surname>Antropova</surname> <given-names>N.</given-names></name> <name><surname>Ashrafian</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>International evaluation of an AI system for breast cancer screening</article-title>. <source>Nature</source> <volume>577</volume>, <fpage>89</fpage>&#x02013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1799-6</pub-id><pub-id pub-id-type="pmid">33057216</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mehrtash</surname> <given-names>A.</given-names></name> <name><surname>Wells</surname> <given-names>W. M.</given-names></name> <name><surname>Tempany</surname> <given-names>C. M.</given-names></name> <name><surname>Abolmaesumi</surname> <given-names>P.</given-names></name> <name><surname>Kapur</surname> <given-names>T.</given-names></name></person-group> (<year>2020</year>). <article-title>Confidence calibration and predictive uncertainty estimation for deep medical image segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>39</volume>, <fpage>3868</fpage>&#x02013;<lpage>3878</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2020.3006437</pub-id><pub-id pub-id-type="pmid">32746129</pub-id></citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Menze</surname> <given-names>B. H.</given-names></name> <name><surname>Jakab</surname> <given-names>A.</given-names></name> <name><surname>Bauer</surname> <given-names>S.</given-names></name> <name><surname>Kalpathy-Cramer</surname> <given-names>J.</given-names></name> <name><surname>Farahani</surname> <given-names>K.</given-names></name> <name><surname>Kirby</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>The multimodal brain tumor image segmentation benchmark (BRATS)</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>34</volume>, <fpage>1993</fpage>&#x02013;<lpage>2024</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2014.2377694</pub-id><pub-id pub-id-type="pmid">25494501</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nagendran</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Lovejoy</surname> <given-names>C. A.</given-names></name> <name><surname>Gordon</surname> <given-names>A. C.</given-names></name> <name><surname>Komorowski</surname> <given-names>M.</given-names></name> <name><surname>Harvey</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Artificial intelligence versus clinicians: systematic review of design, reporting standards, and claims of deep learning studies in medical imaging</article-title>. <source>BMJ</source> <volume>368</volume>:<fpage>m689</fpage>. <pub-id pub-id-type="doi">10.1136/bmj.m689</pub-id><pub-id pub-id-type="pmid">32213531</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nazar</surname> <given-names>U.</given-names></name> <name><surname>Khan</surname> <given-names>M. A.</given-names></name> <name><surname>Lali</surname> <given-names>I. U.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Ali</surname> <given-names>H.</given-names></name> <name><surname>Ashraf</surname> <given-names>I.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Review of automated computerized methods for brain tumor segmentation and classification</article-title>. <source>Curr. Med. Imaging</source> <volume>16</volume>, <fpage>823</fpage>&#x02013;<lpage>834</lpage>. <pub-id pub-id-type="doi">10.2174/1573405615666191120110855</pub-id><pub-id pub-id-type="pmid">33059553</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ojika</surname> <given-names>D.</given-names></name> <name><surname>Patel</surname> <given-names>B.</given-names></name> <name><surname>Reina</surname> <given-names>G. A.</given-names></name> <name><surname>Boyer</surname> <given-names>T.</given-names></name> <name><surname>Martin</surname> <given-names>C.</given-names></name> <name><surname>Shah</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>Addressing the memory bottleneck in AI model training</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2003.08732">http://arxiv.org/abs/2003.08732</ext-link>.</citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Platt</surname> <given-names>J. C.</given-names></name></person-group> (<year>1999</year>). <article-title>Probabilistic outputs for support vector machines and comparisons to regularized likelihood methods</article-title>. <source>Adv. Large Margin Classif</source>. <volume>10</volume>, <fpage>61</fpage>&#x02013;<lpage>74</lpage>.</citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rebsamen</surname> <given-names>M.</given-names></name> <name><surname>Knecht</surname> <given-names>U.</given-names></name> <name><surname>Reyes</surname> <given-names>M.</given-names></name> <name><surname>Wiest</surname> <given-names>R.</given-names></name> <name><surname>Meier</surname> <given-names>R.</given-names></name> <name><surname>McKinley</surname> <given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>Divide and conquer: stratifying training data by tumor grade improves deep learning-based brain tumor segmentation</article-title>. <source>Front. Neurosci</source>. <volume>13</volume>:<fpage>1182</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2019.01182</pub-id><pub-id pub-id-type="pmid">31749678</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Recht</surname> <given-names>M. P.</given-names></name> <name><surname>Dewey</surname> <given-names>M.</given-names></name> <name><surname>Dreyer</surname> <given-names>K.</given-names></name> <name><surname>Langlotz</surname> <given-names>C.</given-names></name> <name><surname>Niessen</surname> <given-names>W.</given-names></name> <name><surname>Prainsack</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Integrating artificial intelligence into the clinical practice of radiology: challenges and recommendations</article-title>. <source>Eur. Radiol</source>. <volume>30</volume>, <fpage>3576</fpage>&#x02013;<lpage>3584</lpage>. <pub-id pub-id-type="doi">10.1007/s00330-020-06672-5</pub-id><pub-id pub-id-type="pmid">32064565</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Reinke</surname> <given-names>A.</given-names></name> <name><surname>Eisenmann</surname> <given-names>M.</given-names></name> <name><surname>Tizabi</surname> <given-names>M. D.</given-names></name> <name><surname>Sudre</surname> <given-names>C. H.</given-names></name> <name><surname>R&#x000E4;dsch</surname> <given-names>T.</given-names></name> <name><surname>Antonelli</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Common limitations of image processing metrics: a picture story</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2104.05642">http://arxiv.org/abs/2104.05642</ext-link>.</citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ren</surname> <given-names>K.</given-names></name> <name><surname>Zheng</surname> <given-names>T.</given-names></name> <name><surname>Qin</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Adversarial attacks and defenses in deep learning</article-title>. <source>Engineering</source> <volume>6</volume>, <fpage>346</fpage>&#x02013;<lpage>360</lpage>. <pub-id pub-id-type="doi">10.1016/j.eng.2019.12.012</pub-id><pub-id pub-id-type="pmid">30640631</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Renard</surname> <given-names>F.</given-names></name> <name><surname>Guedria</surname> <given-names>S.</given-names></name> <name><surname>Palma</surname> <given-names>N. D.</given-names></name> <name><surname>Vuillerme</surname> <given-names>N.</given-names></name></person-group> (<year>2020</year>). <article-title>Variability and reproducibility in deep learning for medical image segmentation</article-title>. <source>Sci. Rep</source>. <volume>10</volume>, <fpage>1</fpage>&#x02013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-69920-0</pub-id><pub-id pub-id-type="pmid">32792540</pub-id></citation></ref>
<ref id="B45">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Romano</surname> <given-names>Y.</given-names></name> <name><surname>Patterson</surname> <given-names>E.</given-names></name> <name><surname>Cand&#x000E8;s</surname> <given-names>E. J.</given-names></name></person-group> (<year>2019</year>). <article-title>Conformalized quantile regression</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1905.03222">http://arxiv.org/abs/1905.03222</ext-link>.</citation></ref>
<ref id="B46">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>U-Net: convolutional networks for biomedical image segmentation,</article-title> in <source>Lecture Notes in Computer Science (Including Subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics), Vol. 9351</source> (<publisher-loc>Munich</publisher-loc>: <publisher-name>Springer Verlag</publisher-name>), <fpage>234</fpage>&#x02013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rousseau</surname> <given-names>A.-J.</given-names></name> <name><surname>Becker</surname> <given-names>T.</given-names></name> <name><surname>Bertels</surname> <given-names>J.</given-names></name> <name><surname>Blaschko</surname> <given-names>M. B.</given-names></name> <name><surname>Valkenborg</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>Post training uncertainty calibration of deep networks for medical image segmentation,</article-title> in <source>2021 IEEE 18th International Symposium on Biomedical Imaging (ISBI)</source> (<publisher-loc>Nice</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1052</fpage>&#x02013;<lpage>1056</lpage>. <pub-id pub-id-type="doi">10.1109/ISBI48211.2021.9434131</pub-id></citation></ref>
<ref id="B48">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Sadri</surname> <given-names>A. R.</given-names></name> <name><surname>Janowczyk</surname> <given-names>A.</given-names></name> <name><surname>Zou</surname> <given-names>R.</given-names></name> <name><surname>Verma</surname> <given-names>R.</given-names></name> <name><surname>Antunes</surname> <given-names>J.</given-names></name> <name><surname>Madabhushi</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>MRQy: an open-source tool for quality control of MR imaging data</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2004.04871">http://arxiv.org/abs/2004.04871</ext-link>.<pub-id pub-id-type="pmid">33176026</pub-id></citation></ref>
<ref id="B49">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Seabold</surname> <given-names>S.</given-names></name> <name><surname>Perktold</surname> <given-names>J.</given-names></name></person-group> (<year>2010</year>). <article-title>statsmodels: Econometric and statistical modeling with python,</article-title> in <source>9th Python in Science Conference</source> (<publisher-loc>Austin, TX</publisher-loc>), <fpage>92</fpage>&#x02013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.25080/Majora-92bf1922-011</pub-id></citation></ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sheller</surname> <given-names>M. J.</given-names></name> <name><surname>Edwards</surname> <given-names>B.</given-names></name> <name><surname>Reina</surname> <given-names>G. A.</given-names></name> <name><surname>Martin</surname> <given-names>J.</given-names></name> <name><surname>Pati</surname> <given-names>S.</given-names></name> <name><surname>Kotrotsou</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Federated learning in medicine: facilitating multi-institutional collaborations without sharing patient data</article-title>. <source>Sci. Rep</source>. <volume>10</volume>:<fpage>12598</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-69250-1</pub-id><pub-id pub-id-type="pmid">32724046</pub-id></citation></ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Steiner</surname> <given-names>D. F.</given-names></name> <name><surname>Chen</surname> <given-names>P.-H. C.</given-names></name> <name><surname>Mermel</surname> <given-names>C. H.</given-names></name></person-group> (<year>2021</year>). <article-title>Closing the translation gap: AI applications in digital pathology</article-title>. <source>Biochim. Biophys. Acta</source> <volume>1875</volume>:<fpage>188452</fpage>. <pub-id pub-id-type="doi">10.1016/j.bbcan.2020.188452</pub-id><pub-id pub-id-type="pmid">33065195</pub-id></citation></ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Strickland</surname> <given-names>E.</given-names></name></person-group> (<year>2019</year>). <article-title>IBM Watson, heal thyself: how IBM overpromised and underdelivered on AI health care</article-title>. <source>IEEE Spectr</source>. <volume>56</volume>, <fpage>24</fpage>&#x02013;<lpage>31</lpage>. <pub-id pub-id-type="doi">10.1109/MSPEC.2019.8678513</pub-id></citation></ref>
<ref id="B53">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Taha</surname> <given-names>A. A.</given-names></name> <name><surname>Hanbury</surname> <given-names>A.</given-names></name></person-group> (<year>2015</year>). <article-title>Metrics for evaluating 3D medical image segmentation: analysis, selection, and tool</article-title>. <source>BMC Med. Imaging</source> <volume>15</volume>:<fpage>29</fpage>. <pub-id pub-id-type="doi">10.1186/s12880-015-0068-x</pub-id><pub-id pub-id-type="pmid">26263899</pub-id></citation></ref>
<ref id="B54">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tamimi</surname> <given-names>A. F.</given-names></name> <name><surname>Juweid</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>Epidemiology and outcome of glioblastoma,</article-title> in <source>Glioblastoma</source>, ed <person-group person-group-type="editor"><name><surname>De Vleeschouwer</surname> <given-names>S.</given-names></name></person-group> (<publisher-loc>Brisbane, AU</publisher-loc>: <publisher-name>Codon Publications</publisher-name>), <fpage>143</fpage>&#x02013;<lpage>153</lpage>. <pub-id pub-id-type="doi">10.15586/codon.glioblastoma.2017.ch8</pub-id></citation></ref>
<ref id="B55">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tan</surname> <given-names>A. C.</given-names></name> <name><surname>Ashley</surname> <given-names>D. M.</given-names></name> <name><surname>L&#x000F3;pez</surname> <given-names>G. Y.</given-names></name> <name><surname>Malinzak</surname> <given-names>M.</given-names></name> <name><surname>Friedman</surname> <given-names>H. S.</given-names></name> <name><surname>Khasraw</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Management of glioblastoma: state of the art and future directions</article-title>. <source>Cancer J. Clin</source>. <volume>70</volume>, <fpage>299</fpage>&#x02013;<lpage>312</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21613</pub-id><pub-id pub-id-type="pmid">32478924</pub-id></citation></ref>
<ref id="B56">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tipping</surname> <given-names>M. E.</given-names></name> <name><surname>Bishop</surname> <given-names>C. M.</given-names></name></person-group> (<year>1999</year>). <article-title>Mixtures of probabilistic principal component analyzers</article-title>. <source>Neural Comput</source>. <volume>11</volume>, <fpage>443</fpage>&#x02013;<lpage>482</lpage>. <pub-id pub-id-type="doi">10.1162/089976699300016728</pub-id><pub-id pub-id-type="pmid">9950739</pub-id></citation></ref>
<ref id="B57">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Udupa</surname> <given-names>J. K.</given-names></name> <name><surname>LeBlanc</surname> <given-names>V. R.</given-names></name> <name><surname>Zhuge</surname> <given-names>Y.</given-names></name> <name><surname>Imielinska</surname> <given-names>C.</given-names></name> <name><surname>Schmidt</surname> <given-names>H.</given-names></name> <name><surname>Currie</surname> <given-names>L. M.</given-names></name> <etal/></person-group>. (<year>2006</year>). <article-title>A framework for evaluating image segmentation algorithms</article-title>. <source>Comput. Med. Imaging Graph</source>. <volume>30</volume>, <fpage>75</fpage>&#x02013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2005.12.001</pub-id><pub-id pub-id-type="pmid">16584976</pub-id></citation></ref>
<ref id="B58">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>van Kempen</surname> <given-names>E. J.</given-names></name> <name><surname>Post</surname> <given-names>M.</given-names></name> <name><surname>Mannil</surname> <given-names>M.</given-names></name> <name><surname>Witkam</surname> <given-names>R. L.</given-names></name> <name><surname>ter Laan</surname> <given-names>M.</given-names></name> <name><surname>Patel</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Performance of machine learning algorithms for glioma segmentation of brain MRI: a systematic literature review and meta-analysis</article-title>. <source>Eur. Radiol.</source> <pub-id pub-id-type="doi">10.1007/s00330-021-08035-0</pub-id><pub-id pub-id-type="pmid">34019128</pub-id></citation></ref>
<ref id="B59">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wadhwa</surname> <given-names>A.</given-names></name> <name><surname>Bhardwaj</surname> <given-names>A.</given-names></name> <name><surname>Singh Verma</surname> <given-names>V.</given-names></name></person-group> (<year>2019</year>). <article-title>A review on brain tumor segmentation of MRI images</article-title>. <source>Magn. Reson. Imaging</source> <volume>61</volume>, <fpage>247</fpage>&#x02013;<lpage>259</lpage>. <pub-id pub-id-type="doi">10.1016/j.mri.2019.05.043</pub-id><pub-id pub-id-type="pmid">31200024</pub-id></citation></ref>
<ref id="B60">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wetstein</surname> <given-names>S.</given-names></name> <name><surname>Gonz&#x000E1;lez-Gonzalo</surname> <given-names>C.</given-names></name> <name><surname>Bortsova</surname> <given-names>G.</given-names></name> <name><surname>Liefers</surname> <given-names>B.</given-names></name> <name><surname>Dubost</surname> <given-names>F.</given-names></name> <name><surname>Katramados</surname> <given-names>I.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Adversarial attack vulnerability of medical image analysis systems: unexplored factors</article-title>.</citation></ref>
<ref id="B61">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Whittle</surname> <given-names>I. R.</given-names></name></person-group> (<year>2004</year>). <article-title>The dilemma of low grade glioma</article-title>. <source>J. Neurol. Neurosurg. Psychiatry</source> <volume>75</volume>, <fpage>31</fpage>&#x02013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1136/jnnp.2004.040501</pub-id><pub-id pub-id-type="pmid">15146037</pub-id></citation></ref>
<ref id="B62">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Witthayanuwat</surname> <given-names>S.</given-names></name> <name><surname>Pesee</surname> <given-names>M.</given-names></name> <name><surname>Supaadirek</surname> <given-names>C.</given-names></name> <name><surname>Supakalin</surname> <given-names>N.</given-names></name> <name><surname>Thamronganantasakul</surname> <given-names>K.</given-names></name> <name><surname>Krusun</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>Survival analysis of glioblastoma multiforme</article-title>. <source>Asian Pac. J. Cancer Prevent</source>. <volume>19</volume>, <fpage>2613</fpage>&#x02013;<lpage>2617</lpage>.</citation></ref>
<ref id="B63">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Woods</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Teuscher</surname> <given-names>C.</given-names></name></person-group> (<year>2019</year>). <article-title>Adversarial explanations for understanding image classification decisions and improved neural network robustness</article-title>. <source>Nat. Mach. Intell</source>. <volume>1</volume>, <fpage>508</fpage>&#x02013;<lpage>516</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-019-0104-6</pub-id></citation></ref>
<ref id="B64">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yeghiazaryan</surname> <given-names>V.</given-names></name> <name><surname>Voiculescu</surname> <given-names>I.</given-names></name></person-group> (<year>2018</year>). <article-title>Family of boundary overlap metrics for the evaluation of medical image segmentation</article-title>. <source>J. Med. Imaging</source> <volume>5</volume>:<fpage>1</fpage>. <pub-id pub-id-type="doi">10.1117/1.JMI.5.1.015006</pub-id><pub-id pub-id-type="pmid">29487883</pub-id></citation></ref>
<ref id="B65">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zech</surname> <given-names>J. R.</given-names></name> <name><surname>Badgeley</surname> <given-names>M. A.</given-names></name> <name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Costa</surname> <given-names>A. B.</given-names></name> <name><surname>Titano</surname> <given-names>J. J.</given-names></name> <name><surname>Oermann</surname> <given-names>E. K.</given-names></name></person-group> (<year>2018</year>). <article-title>Variable generalization performance of a deep learning model to detect pneumonia in chest radiographs: a cross-sectional study</article-title>. <source>PLoS Med</source>. <volume>15</volume>:<fpage>e1002683</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pmed.1002683</pub-id><pub-id pub-id-type="pmid">30399157</pub-id></citation></ref>
<ref id="B66">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zwanenburg</surname> <given-names>A.</given-names></name> <name><surname>Leger</surname> <given-names>S.</given-names></name> <name><surname>Agolli</surname> <given-names>L.</given-names></name> <name><surname>Pilz</surname> <given-names>K.</given-names></name> <name><surname>Troost</surname> <given-names>E. G. C.</given-names></name> <name><surname>Richter</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Assessing robustness of radiomic features by image perturbation</article-title>. <source>Sci. Rep</source>. <volume>9</volume>, <fpage>1</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1038/s41598-018-36938-4</pub-id><pub-id pub-id-type="pmid">30679599</pub-id></citation></ref>
</ref-list> 
</back>
</article>