<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2022.803685</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Improving Variational Autoencoders for New Physics Detection at the LHC With Normalizing Flows</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Jawahar</surname> <given-names>Pratik</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1531274/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Aarrestad</surname> <given-names>Thea</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1329608/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Chernyavskaya</surname> <given-names>Nadezda</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1656061/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Pierini</surname> <given-names>Maurizio</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/706738/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wozniak</surname> <given-names>Kinga A.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1168133/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ngadiuba</surname> <given-names>Jennifer</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1168050/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Duarte</surname> <given-names>Javier</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/1041469/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Tsan</surname> <given-names>Steven</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Experimental Physics Department, European Center for Nuclear Research (CERN)</institution>, <addr-line>Geneva</addr-line>, <country>Switzerland</country></aff>
<aff id="aff2"><sup>2</sup><institution>Faculty of Computer Science, University of Vienna</institution>, <addr-line>Vienna</addr-line>, <country>Austria</country></aff>
<aff id="aff3"><sup>3</sup><institution>Particle Physics Division, Fermi National Accelerator Laboratory (FNAL)</institution>, <addr-line>Batavia, IL</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Lauritsen Laboratory of High Energy Physics, California Institute of Technology</institution>, <addr-line>Pasadena, CA</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>Department of Physics, University of California, San Diego</institution>, <addr-line>San Diego, CA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Michela Paganini, Facebook, United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Alexander Radovic, Borealis AI, Canada; Tobias Golling, Universit&#x000E9; de Gen&#x000E8;ve, Switzerland</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Pratik Jawahar <email>pjawahar&#x00040;wpi.edu</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Big Data and AI in High Energy Physics, a section of the journal Frontiers in Big Data</p></fn></author-notes>
<pub-date pub-type="epub">
<day>28</day>
<month>02</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>5</volume>
<elocation-id>803685</elocation-id>
<history>
<date date-type="received">
<day>28</day>
<month>10</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>17</day>
<month>01</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2022 Jawahar, Aarrestad, Chernyavskaya, Pierini, Wozniak, Ngadiuba, Duarte and Tsan.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Jawahar, Aarrestad, Chernyavskaya, Pierini, Wozniak, Ngadiuba, Duarte and Tsan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license> </permissions>
<abstract>
<p>We investigate how to improve new physics detection strategies exploiting variational autoencoders and normalizing flows for anomaly detection at the Large Hadron Collider. As a working example, we consider the DarkMachines challenge dataset. We show how different design choices (e.g., event representations, anomaly score definitions, network architectures) affect the result on specific benchmark new physics models. Once a baseline is established, we discuss how to improve the anomaly detection accuracy by exploiting normalizing flow layers in the latent space of the variational autoencoder.</p></abstract>
<kwd-group>
<kwd>anomaly detection (AD)</kwd>
<kwd>variational auto encoder (VAE)</kwd>
<kwd>normalizing flow (NF)</kwd>
<kwd>Large Hadron Collider (LHC)</kwd>
<kwd>new physics beyond standard model</kwd>
<kwd>graph convolutional network (GCN)</kwd>
<kwd>convolutional neural net</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="2"/>
<equation-count count="8"/>
<ref-count count="66"/>
<page-count count="12"/>
<word-count count="7816"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Most searches for new physics at the CERN Large Hadron Collider (LHC) target specific experimental signatures. The underlying assumption of a specific new physics model could enter at various stages in the search design, e.g., when reducing the data rate from 40 M to 1,000 collision events per second in real time (Trocino, <xref ref-type="bibr" rid="B62">2014</xref>; Aad et al., <xref ref-type="bibr" rid="B2">2020</xref>; Sirunyan et al., <xref ref-type="bibr" rid="B58">2020</xref>), when designing the event selection, or when running the final hypothesis testing. When searching for pre-established and theoretically well-motivated particles (e.g., the Higgs boson), this strategy is extremely successful because the underlying assumption can be exploited to maximize the search sensitivity. On the other hand, the lack of a predefined target might turn this strength into a limitation.</p>
<p>To compensate for this potential problem, <italic>model independent</italic> searches are also carried out (Aaltonen et al., <xref ref-type="bibr" rid="B3">2009</xref>; Aaron et al., <xref ref-type="bibr" rid="B4">2009</xref>; D0 Collaboration, <xref ref-type="bibr" rid="B21">2012</xref>; CMS-PAS-EXO-14-016, <xref ref-type="bibr" rid="B17">2017</xref>; Aaboud et al., <xref ref-type="bibr" rid="B1">2019</xref>) at hadron colliders. These searches consist in an extensive set of comparisons between the data distribution and the expectation derived from Monte Carlo simulation. Many comparisons are carried out in parallel for multiple physics-motivated features while applying different event selections. However, when searching for new physics among many channels, the &#x0201C;global&#x0201D; significance of observing a particular discrepancy must take into account the probability of observing such a discrepancy anywhere. This so called look-elsewhere effect can be quantified in terms of a trial factor (Gross and Vitells, <xref ref-type="bibr" rid="B34">2010</xref>). While the large trial factor typically reduces the statistical power of this strategy in terms of significance, model independent searches are valuable tools to identify possible regions of interest and provide data-driven motivations for traditional, more targeted searches to be performed on future data.</p>
<p>Recently, the use of machine learning techniques has been advocated as a means to reduce the model dependence (Weisser and Williams, <xref ref-type="bibr" rid="B63">2016</xref>; Collins et al., <xref ref-type="bibr" rid="B18">2018</xref>, <xref ref-type="bibr" rid="B19">2019</xref>, <xref ref-type="bibr" rid="B20">2021</xref>; Blance et al., <xref ref-type="bibr" rid="B11">2019</xref>; Cerri et al., <xref ref-type="bibr" rid="B15">2019</xref>; D&#x00027;Agnolo and Wulzer, <xref ref-type="bibr" rid="B23">2019</xref>; De Simone and Jacques, <xref ref-type="bibr" rid="B25">2019</xref>; Heimel et al., <xref ref-type="bibr" rid="B37">2019</xref>; Andreassen et al., <xref ref-type="bibr" rid="B8">2020</xref>; Cheng et al., <xref ref-type="bibr" rid="B16">2020</xref>; Dillon et al., <xref ref-type="bibr" rid="B26">2020</xref>; Farina et al., <xref ref-type="bibr" rid="B30">2020</xref>; Hajer et al., <xref ref-type="bibr" rid="B35">2020</xref>; Khosa and Sanz, <xref ref-type="bibr" rid="B41">2020</xref>; Nachman, <xref ref-type="bibr" rid="B48">2020</xref>; Nachman and Shih, <xref ref-type="bibr" rid="B49">2020</xref>; Park et al., <xref ref-type="bibr" rid="B52">2020</xref>; Amram and Suarez, <xref ref-type="bibr" rid="B6">2021</xref>; Bortolato et al., <xref ref-type="bibr" rid="B12">2021</xref>; D&#x00027;Agnolo et al., <xref ref-type="bibr" rid="B22">2021</xref>; Finke et al., <xref ref-type="bibr" rid="B31">2021</xref>; Gonski et al., <xref ref-type="bibr" rid="B33">2021</xref>; Hallin et al., <xref ref-type="bibr" rid="B36">2021</xref>; Ostdiek, <xref ref-type="bibr" rid="B50">2021</xref>). 
In this context, the particle-physics community engaged in two data challenges: the LHC Olympics 2020 (Kasieczka et al., <xref ref-type="bibr" rid="B40">2021</xref>) and the DarkMachines challenge (Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>), where different approaches were explored to attempt to detect an unknown signal of new physics hidden in simulated data.</p>
<p>As part of our contribution to the DarkMachines challenge, we investigated the use of a particle-based variational autoencoder (VAE) (Kingma and Welling, <xref ref-type="bibr" rid="B44">2014</xref>; Rezende et al., <xref ref-type="bibr" rid="B56">2014</xref>) and the possibility of enhancing its anomaly detection capability by using normalizing flows (NFs) (Papamakarios et al., <xref ref-type="bibr" rid="B51">2021</xref>) in the latent space to optimize the choice of the latent-space prior. In this article, we document those studies and expand that effort, investigating the impact of specific architecture choices (event representation, network architecture, usage of expert features, and definition of the anomaly score). This study is an update of our contribution to the DarkMachines challenge (Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>), which benefits from the lessons learned by the DarkMachines challenge. Taking inspiration from solutions presented by other groups in the challenge (e.g., Caron et al., <xref ref-type="bibr" rid="B14">2021</xref>; Ostdiek, <xref ref-type="bibr" rid="B50">2021</xref>), we evaluate the impact of some of their findings on our specific setup. In some cases (but not always), these solutions translate into an improved performance, quantified using the same metrics presented in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>). In this way, we establish an improved baseline model, on top of which we evaluate the impact of the normalizing flow layers in the latent space.</p></sec>
<sec id="s2">
<title>2. Data Samples and Event Representation</title>
<p>This study is based on the datasets released on the Zenodo platform (DarkMachines Community, <xref ref-type="bibr" rid="B24">2020</xref>) in relation to the Dark Machines Anomaly Score Challenge (Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>). They consist of a set of processes predicted in the standard model (SM) of particle physics, mixed according to their production cross section in proton-proton collisions at 13TeV center-of-mass energy, and a set of benchmark signal samples. The datasets contain labels, identifying the process that generated each event. Labels are ignored during training and used to evaluate performance metrics.</p>
<p>For each sample, four datasets are provided, corresponding to four different event selections (called <italic>channels</italic>; Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>):</p>
<list list-type="bullet">
<list-item><p>Channel 1: <italic>H</italic><sub>T</sub>&#x02265;600GeV, <inline-formula><mml:math id="M1"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x02265;</mml:mo><mml:mn>200</mml:mn><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">GeV</mml:mtext></mml:mstyle></mml:math></inline-formula>, and <inline-formula><mml:math id="M2"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x02265;</mml:mo><mml:mn>0</mml:mn><mml:mo>.</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula>.</p></list-item>
<list-item><p>Channel 2a: <inline-formula><mml:math id="M3"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x02265;</mml:mo><mml:mn>50</mml:mn><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">GeV</mml:mtext></mml:mstyle></mml:math></inline-formula> and at least three light leptons (muons or electrons) with <italic>p</italic><sub>T</sub>&#x0003E;15GeV.</p></list-item>
<list-item><p>Channel 2b: <inline-formula><mml:math id="M4"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x02265;</mml:mo><mml:mn>50</mml:mn><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">GeV</mml:mtext></mml:mstyle></mml:math></inline-formula>, <italic>H</italic><sub>T</sub>&#x02265;50GeV and at least two light leptons (muons or electrons) with <italic>p</italic><sub>T</sub>&#x0003E;15GeV.</p></list-item>
<list-item><p>Channel 3: <italic>H</italic><sub>T</sub>&#x02265;600GeV, <inline-formula><mml:math id="M5"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup><mml:mo>&#x02265;</mml:mo><mml:mn>100</mml:mn><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">GeV</mml:mtext></mml:mstyle></mml:math></inline-formula>.</p></list-item>
</list>
<p>where <italic>p</italic><sub>T</sub> is the magnitude of a particle&#x00027;s transverse momentum, <italic>H</italic><sub>T</sub> is the scalar sum of the jet <italic>p</italic><sub>T</sub> in the event, and <inline-formula><mml:math id="M6"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msubsup></mml:math></inline-formula> is the vector equal and opposite to the vector sum of the transverse momenta of the reconstructed particles in the event, while <inline-formula><mml:math id="M7"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup></mml:math></inline-formula> is its magnitude<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref>. More details are provided in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>).</p>
<p>The input consists of the momenta of all the reconstructed physics objects in the event (jets, b jets, electrons e, muons &#x003BC;, and photons), ordered by decreasing <italic>p</italic><sub>T</sub>. Each list of objects is zero-padded to force each event into a fixed-length matrix with the same order: up to 15 jets, and up to 4 each of b jets, &#x003BC;<sup>&#x000B1;</sup>, e<sup>&#x000B1;</sup>, and photons. We pre-process the input by applying the <monospace>scikit-learn</monospace> (Pedregosa et al., <xref ref-type="bibr" rid="B54">2011</xref>) standard scaling and arranging the list of objects into a matrix of 39 particles times four momentum features (<italic>E, p</italic><sub>T</sub>, &#x003B7;, &#x003D5;), where <italic>E</italic> is the particle energy. For e, &#x003BC;, and photons, the energy is computed assuming zero mass. For jets, the measured jet mass is used. The input matrix is interpreted as an image or an unordered point cloud, depending on the underlying VAE architecture.</p>
<p>The training and validation dataset consists of background events from the SM mixture. The available dataset size is detailed in <xref ref-type="table" rid="T1">Table 1</xref> for each of the channels. The background test samples are combined with the benchmark signal samples listed in <xref ref-type="table" rid="T2">Table 2</xref> to form the labeled test dataset on which performance is evaluated.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Summary of the available dataset size.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Channel 1</bold></th>
<th valign="top" align="center"><bold>Channel 2a</bold></th>
<th valign="top" align="center"><bold>Channel 2b</bold></th>
<th valign="top" align="center"><bold>Channel 3</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Training</td>
<td valign="top" align="center">193, 800</td>
<td valign="top" align="center">13, 425</td>
<td valign="top" align="center">238, 450</td>
<td valign="top" align="center">7, 100, 934</td>
</tr>
<tr>
<td valign="top" align="left">Validation</td>
<td valign="top" align="center">10, 200</td>
<td valign="top" align="center">707</td>
<td valign="top" align="center">12, 550</td>
<td valign="top" align="center">373, 733</td>
</tr>
<tr>
<td valign="top" align="left">Bkg. test</td>
<td valign="top" align="center">10, 000</td>
<td valign="top" align="center">5, 868</td>
<td valign="top" align="center">89, 000</td>
<td valign="top" align="center">1, 025, 333</td>
</tr>
<tr>
<td valign="top" align="left">Sig. test</td>
<td valign="top" align="center">38, 666</td>
<td valign="top" align="center">5, 868</td>
<td valign="top" align="center">89, 676</td>
<td valign="top" align="center">1, 023, 320</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>BSM processes contributing to the signal dataset in each channel.</p></caption>
<table frame="hsides" rules="groups">
<thead><tr>
<th valign="top" align="left"><bold>BSM process</bold></th>
<th valign="top" align="center"><bold>Code</bold></th>
<th valign="top" align="center"><bold>Ch.1</bold></th>
<th valign="top" align="center"><bold>Ch.2a</bold></th>
<th valign="top" align="center"><bold>Ch.2b</bold></th>
<th valign="top" align="center"><bold>Ch.3</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Z&#x02032; &#x0002B; jet</td>
<td valign="top" align="center"><monospace>monojet_Zp2000.0_DM_50.0</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">Z&#x02032;&#x0002B; W/Z</td>
<td valign="top" align="center"><monospace>monoV_Zp2000.0_DM_50.0</monospace> </td>
<td/>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">Z&#x02032; &#x0002B; t</td>
<td valign="top" align="center"><monospace>monotop_200_A</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">Z&#x02032; in LFV U(1)<sub>L<sub>&#x003BC;</sub>&#x02212;L<sub>&#x003C4;</sub></sub></td>
<td valign="top" align="center"><monospace>pp23mt_50</monospace> </td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td/>
<td valign="top" align="center"><monospace>pp24mt_50</monospace> </td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">/<italic>R</italic>-SUSY <inline-formula><mml:math id="M8"><mml:mover accent="false"><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mover accent="false"><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>stlp_st1000</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">/<italic>R</italic>-SUSY <inline-formula><mml:math id="M9"><mml:mover accent="false"><mml:mrow><mml:mtext>q</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mover accent="false"><mml:mrow><mml:mtext>q</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>sqsq1_sq1400_neut800</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">SUSY <inline-formula><mml:math id="M10"><mml:mover accent="false"><mml:mrow><mml:mtext>g</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mover accent="false"><mml:mrow><mml:mtext>g</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>glgl1400_neutralino1100</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td/>
<td valign="top" align="center"><monospace>glgl1600_neutralino800</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">SUSY <inline-formula><mml:math id="M11"><mml:mover accent="false"><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mover accent="false"><mml:mrow><mml:mtext>t</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>stop2b1000_neutralino300</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">SUSY <inline-formula><mml:math id="M12"><mml:mover accent="false"><mml:mrow><mml:mtext>q</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mover accent="false"><mml:mrow><mml:mtext>q</mml:mtext></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>sqsq_sq1800_neut800</monospace> </td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
</tr>
<tr>
<td valign="top" align="left">SUSY <inline-formula><mml:math id="M13"><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x000B1;</mml:mo></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>chaneut_cha200_neut50</monospace> </td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td/>
<td valign="top" align="center"><monospace>chaneut_cha250_neut150</monospace> </td>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td valign="top" align="left">SUSY <inline-formula><mml:math id="M14"><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x000B1;</mml:mo></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>&#x003C7;</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x000B1;</mml:mo></mml:mrow></mml:msup></mml:math></inline-formula></td>
<td valign="top" align="center"><monospace>chacha_cha300_neut140</monospace> </td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td/>
<td valign="top" align="center"><monospace>chacha_cha400_neut60</monospace> </td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
<tr>
<td/>
<td valign="top" align="center"><monospace>chacha_cha600_neut200</monospace> </td>
<td/>
<td/>
<td valign="top" align="center">&#x000D7;</td>
<td/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TN1"><p><italic>The process code, adopted in this study, is taken from Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>)</italic>.</p></fn>
</table-wrap-foot>
</table-wrap></sec>
<sec id="s3">
<title>3. Training Setup and Evaluation Metrics</title>
<p>Variational Autoencoders (Kingma and Welling, <xref ref-type="bibr" rid="B44">2014</xref>, <xref ref-type="bibr" rid="B45">2019</xref>; Rezende et al., <xref ref-type="bibr" rid="B56">2014</xref>) are a class of likelihood-based generative models that maximize the likelihood of the training data according to the generative model &#x0220F;<sub><italic>x</italic>&#x02208;data</sub><italic>p</italic><sub>&#x003B8;</sub>(<italic>x</italic>) for the set of observed variables <italic>x</italic> in the training data. To achieve this in a tractable way, the generative model is augmented by the introduction of a set of latent variables <italic>z</italic>, such that the marginal distribution over the observed variables <italic>p</italic><sub>&#x003B8;</sub>(<italic>x</italic>), is given by: <italic>p</italic><sub>&#x003B8;</sub>(<italic>x</italic>) &#x0003D; &#x0222B;<italic>p</italic><sub>&#x003B8;</sub>(<italic>x</italic>|<italic>z</italic>)<italic>q</italic><sub>&#x003B8;</sub>(<italic>z</italic>)<italic>dz</italic>. In this way, <italic>q</italic><sub>&#x003B8;</sub>(<italic>z</italic>) can be a relatively simple distribution, such as a Gaussian, while maintaining high expressivity for the marginal distribution <italic>p</italic><sub>&#x003B8;</sub>(<italic>x</italic>) as an infinite mixture of simple distributions controlled by <italic>z</italic>. Besides being used as generative models, VAEs have been shown to be effective as anomaly detection algorithms (An and Cho, <xref ref-type="bibr" rid="B7">2015</xref>).</p>
<p>In this work, the VAE models are trained on the training and validation datasets, minimizing the loss function:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M15"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext>total</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003B2;</mml:mi><mml:msub><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mrow><mml:mtext>KL</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext>C</mml:mtext></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>L</italic><sub>C</sub> is a reconstruction loss, which is chosen to be an L<sub>1</sub>-type permutation-invariant Chamfer loss (Barrow et al., <xref ref-type="bibr" rid="B9">1977</xref>):</p>
<disp-formula id="E22"><label>(2)</label><mml:math id="M60"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mtext>C</mml:mtext></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>input</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mtext>min</mml:mtext></mml:mrow><mml:mrow><mml:mover accent='true'><mml:mi>y</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>output</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mrow></mml:mstyle><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02212;</mml:mo><mml:mover accent='true'><mml:mi>y</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:mo>+</mml:mo><mml:mstyle displaystyle='true'><mml:munder><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mover accent='true'><mml:mi>y</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>output</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mrow><mml:munder><mml:mrow><mml:mtext>min</mml:mtext></mml:mrow><mml:mrow><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mi>S</mml:mi><mml:mrow><mml:mtext>input</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mrow></mml:mstyle><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:mover accent='true'><mml:mi>x</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02212;</mml:mo><mml:mover accent='true'><mml:mi>y</mml:mi><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo 
stretchy="false">&#x0007C;</mml:mo><mml:mo>&#x000A0;</mml:mo><mml:mo>,</mml:mo></mml:mrow></mml:math></disp-formula>
<p>similar to the L<sub>2</sub>-type Chamfer distance used in Fan et al. (<xref ref-type="bibr" rid="B29">2017</xref>) and Zhang et al. (<xref ref-type="bibr" rid="B64">2020</xref>). In Equation (1), <italic>D</italic><sub>KL</sub> is the Kullback&#x02013;Leibler divergence term usually employed to force the data distribution in the latent space to a multidimensional Gaussian with unitary covariance matrix (Rezende and Mohamed, <xref ref-type="bibr" rid="B55">2015</xref>), and &#x003B2; is a parameter that controls the relative importance of the two terms (Higgins et al., <xref ref-type="bibr" rid="B38">2017</xref>).</p>
<p>All of our models are optimized using the Adam minimizer (Kingma and Ba, <xref ref-type="bibr" rid="B42">2015</xref>). A learning rate of 10<sup>&#x02212;4</sup> is applied along with a brute force early stopping strategy used on an ad-hoc basis. A batch size of 32 is chosen to train models. All models are implemented with the <monospace>PyTorch</monospace> (Paszke et al., <xref ref-type="bibr" rid="B53">2019</xref>) deep learning framework and are hosted on GitHub (Jawahar and Pierini, <xref ref-type="bibr" rid="B39">2021</xref>).</p>
<p>We train and test all our models on the WPI Turing Research Cluster<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref>, using 8 CPU nodes and 1 GPU node (NVIDIA Tesla V100 or Tesla P100).</p>
<p>At inference time, <italic>L</italic><sub>C</sub> is used as an anomaly detection score, to quantify the distance between the input and the output. By applying a lower-bound threshold on <italic>L</italic><sub>C</sub>, we identify every event with an <italic>L</italic><sub>C</sub> value larger than the threshold as an anomaly. By comparing this prediction to the ground truth, we can assess the performance of the VAE on specific signal benchmark models.</p>
<p>To evaluate model performance we follow the same strategy and code used in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>) to enable comparison with other models tested on this dataset. As explained in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>), we extract four main performance parameters from the receiver operating characteristic (ROC) curves based on the chosen anomaly metric for each model, namely the area under the curve (AUC) and true positive rate (also known as the signal efficiency &#x003F5;<sub>S</sub>) at three different, fixed values of the false positive rate (also known as background efficiency &#x003F5;<sub>B</sub>). We then combine these scores from all models on all available signal regions across all channels of the dataset to form box-and-whisker plots, using six different combination and comparison strategies, namely the highest mean score method, highest median score method, average rank method, top scorer method, top-5 scorer method, and highest minimum scorer method. A box is drawn spanning the inner half (50% quantile centered at the median) of the data as shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. A line through the box marks the median. Whiskers extend from the box to the maximum and minimum, unless these are further away from the edge of the box than 1.5 box lengths. The outlier points are shown as circles.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Anomaly detection performance for the Conv-VAE with different inputs given (see text for more details): all physics objects in the event (AllObj); truncated input object list (TrdObj); all objects and array of object multiplicity (AllObj&#x0002B;Mult); truncated input object list and array of object multiplicity (TrdObj&#x0002B;Mult).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0001.tif"/>
</fig>
<p>For <xref ref-type="fig" rid="F1">Figure 1</xref> and the other figures, the representative ranking as denoted by the legend corresponds to the performance based on the highest mean score method unless mentioned otherwise. However, to choose the best model for each experiment described in this article, we consider all six comparison methods to arrive at a consensus. The code to perform these comparisons and to generate the corresponding plots is available in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>).</p></sec>
<sec id="s4">
<title>4. Baseline VAE Model</title>
<p>The main goal of this study is to evaluate the impact of normalizing flow layers in the latent space on the anomaly detection capability of a reference VAE model. This and the following sections describe how this reference model is built, starting from the VAE based on convolutional layers (Conv-VAE) presented in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>) and modifying its architecture based on some of the lessons learned during the DarkMachine challenge.</p>
<p>The encoder of the initial Conv-VAE consists of three convolutional layers, with 32, 16, and 8 kernels of size (3, 4), (5, 1), and (7, 1), respectively. For all layers, the stride is set to 1 and zero padding to &#x0201C;same.&#x0201D; The output of the convolutional layers is flattened and passed to two fully-connected neural network (FCN) layers that output the mean and variance for the latent space. The cardinality of the latent space is fixed to 15. The decoder mirrors the encoder architecture, returning an output of the same size as the input.</p>
<p>In order to define the reference model, the architecture of the starting model is modified in different ways, each time evaluating the impact of a given choice on the test dataset. Several possibilities are considered: how to embed the event in the two-dimensional (2D) array (see Section 4.1); how to interpret the array, e.g., as an image or a graph (see Section 4.2); whether to extend the event representation beyond the particle momenta, adding domain-specific high level features as an additional input (see Section 4.3); and which anomaly score to use (see Section 4.4). We study various options for each of these points, following this order. Doing so, we establish a candidate model, which replaces the initial model. We evaluate on this new model the benefit of using normalizing flow layers in the latent space (see Section 5) to improve the anomaly detection accuracy.</p>
<sec>
<title>4.1. Data Representation</title>
<p>By their nature, events consist of a variable number of objects. To some extent, this conflicts with most neural network architectures, which assume a fixed-size input. As a baseline, we adopt the simplest solution, i.e., to zero-pad all events to standardized event sizes for all available samples. To get a better idea of how padding affects results, we study performance across alternative input encodings. We consider two main types of encodings, listed as AllObj and TrdObj in <xref ref-type="fig" rid="F1">Figure 1</xref>. The former involves considering the entire event which implies allowing for a large enough padding such that every object per event is taken into consideration across the entire dataset. The latter involves cutting down the padding and the input sequence by considering only up to four leading jets and three objects each of the other types per event.</p>
<p>When using the truncated sequence, the model loses information regarding the number of objects of each type per event, which is implicitly learned when the whole sequence is considered. To compensate for this loss, one can explicitly add this information passing a second input to the model, consisting of a vector containing the multiplicities of each object type. This input is concatenated to the flattened output received from the convolutional layers in the encoder before passing them to the fully connected layers. For the sake of comparison, we also do the same for the AllObj case (labeled as &#x0201C;&#x0002B;Mult&#x0201D; in <xref ref-type="fig" rid="F1">Figure 1</xref>).</p>
<p>The results in <xref ref-type="fig" rid="F1">Figure 1</xref> show that the truncated sequence does worse than the full sequence. We also see little improvement in performance with the addition of multiplicity information per event in both the AUC as well as performance at lower background efficiencies. As a result, we keep the input encoding that considers the complete sequence per event.</p></sec>
<sec>
<title>4.2. VAE Architecture</title>
<p>The convolutional architecture used for the baseline VAE is not the only option to handle the input considered in this study. The ensemble of reconstructed particles in an event can be represented as a point cloud. Doing so, we can process it with a graph neural network. The main advantage of this choice lies in the permutation invariance of the graph processing, which matches that of the loss in Equation (2) and complies with the unordered nature of the input list of particles. Graph-based architectures have already been shown to perform better with sparse, non-Euclidean data representations in general (Bronstein et al., <xref ref-type="bibr" rid="B13">2017</xref>; Zhou et al., <xref ref-type="bibr" rid="B66">2020</xref>) and in particle physics in particular (Duarte and Vlimant, <xref ref-type="bibr" rid="B27">2020</xref>; Shlomi et al., <xref ref-type="bibr" rid="B57">2020</xref>).</p>
<p>To this end, we consider a GCN-VAE model composed of multilayer graph convolutional network layers (GCNs) (Kipf and Welling, <xref ref-type="bibr" rid="B46">2017</xref>) and FCN layers in both the encoder and the decoder. As for the VAE, the input graphs are built from the input list described in Section 2, each particle representing one vertex of the graph in the space identified by five particle features: <italic>E, p</italic><sub>T</sub>, &#x003B7;, &#x003D5;, and object type. The object type is a label-encoded integer that signifies the object type. The input is structured as a fully connected, undirected graph which is passed to the GCN layers of the encoder, defined as (Kipf and Welling, <xref ref-type="bibr" rid="B46">2017</xref>):</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M17"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup><mml:mover accent="false"><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msup><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>H</italic><sub>(<italic>l</italic>)</sub> is the input to the (<italic>l</italic>&#x0002B;1)th GCN layer with <italic>H</italic><sub>(0)</sub> &#x0003D; <italic>X</italic>, where <italic>X</italic> represents the node feature matrix. <italic>H</italic><sub>(<italic>l</italic>&#x0002B;1)</sub> is the layer output, <inline-formula><mml:math id="M18"><mml:mover accent="false"><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mi>A</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mi>I</mml:mi></mml:math></inline-formula>, where <italic>A</italic> is the adjacency matrix of the graph, with <italic>I</italic> being the identity matrix which implies added self connections for each node. <inline-formula><mml:math id="M19"><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>D</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mi>A</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is defined for the normalized adjacency-based message-passing regime, <italic>W</italic><sub>(<italic>l</italic>)</sub> is the layer weights matrix and &#x003C3;(&#x02022;) is a suitable nonlinear activation function. The output of the last GCN layer is flattened and passed to an FCN layer which populates the latent space. The encoder has three GCN layers that scale the 5 node features to 32, 16, and 2 respectively, followed by a single FCN layer which generates a 15-dimensional latent space. The decoder has a symmetrically inverted structure with the sampled point being upscaled through an FCN layer first and the resulting output is reshaped and passed to GCN layers that reconstruct the node features.</p>
<p>Considering all comparison metrics along with the representative results shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, graph architectures exhibit a definitive improvement in performance compared to the Conv-VAE. The improvement is seen not only in the AUC metric, but more significantly in the &#x003F5;<sub>S</sub> at low &#x003F5;<sub>B</sub>. Because of this, the GCN-VAE is used as the reference architecture in the rest of this section and in Section 5.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Comparison of the GCN-VAE and Conv-VAE performances, in terms of the benchmark figures of merit adopted in the article.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0002.tif"/>
</fig></sec>
<sec>
<title>4.3. Physics-Motivated High-Level Features</title>
<p>We also experiment with adding physics-motivated high-level features, as explicit inputs to the model, similar to what was done with object multiplicities in Section 4.1. Doing so, we intend to check if domain knowledge helps in improving anomaly detection capability. We pass event information such as the missing transverse momentum in the event (<inline-formula><mml:math id="M20"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup></mml:math></inline-formula>), the scalar sum of the jet <italic>p</italic><sub>T</sub> (<italic>H</italic><sub>T</sub>) and <inline-formula><mml:math id="M21"><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mtext>Eff</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup></mml:math></inline-formula> to the model, by concatenating these with the output of the convolutional layers of the encoder. The concatenated output is then passed to the fully connected layers in the encoder to form the latent space. 
After the point sampled from the latent space passes through the fully connected layers of the decoder, the reconstructed <inline-formula><mml:math id="M22"><mml:msup><mml:mrow><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mtext>T</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">miss</mml:mtext></mml:mstyle></mml:mrow></mml:msup></mml:math></inline-formula>, <italic>H</italic><sub>T</sub>, and <italic>m</italic><sub>Eff</sub> are extracted and the rest of the layer output is re-shaped and further passed to the subsequent layers of the decoder.</p>
<p>To include the reconstruction of these features in the loss, we add to Equation (1) a mean-squared error (MSE) term, computed from the reconstructed and input high-level features and weighted by a coefficient. This coefficient is treated as a hyperparameter that is scanned until the best performance is found.</p>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> shows that adding high-level features brings no definitive improvement in performance, thereby leading us to conclude that the baseline model with a marginally lower number of trainable parameters is a good choice.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Comparison of the GCN-VAE performance with and without high-level features added as a separate input.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0003.tif"/>
</fig></sec>
<sec>
<title>4.4. Anomaly Scores</title>
<p>While so far the Chamfer loss has been used as the anomaly score, this is not the only possibility. We consider two alternative metrics: the <italic>D</italic><sub>KL</sub> term in Equation (1) and (Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>):</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M23"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mi>z</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003BC; and &#x003C3; are the mean and RMS returned by the encoder and the index <italic>i</italic> runs across the latent-space dimensions.</p>
<p>The use of different anomaly scores requires a tuning of the &#x003B2; hyperparameter. Since &#x003B2; determines the relative importance of the <italic>D</italic><sub>KL</sub> and Chamfer loss terms in the loss, the use of one or the other as anomaly score is certainly related to the choice of the optimal &#x003B2; value. Similarly, the use of <italic>R</italic><sub><italic>z</italic></sub> (i.e., anomaly detection in the latent space) might not be optimal when using a &#x003B2; value that was tuned to emphasize the reconstruction accuracy (i.e., the minimization of the Chamfer term in the loss). On the other hand, the study in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>) shows that an excessive tuning of the hyperparameters affects generalization of performance negatively beyond the available dataset.</p>
<p>In order to address this point, we compare three weights for the &#x003B2; term. The first case (&#x003B2; &#x0003D; 1) corresponds to training the VAE without the contribution of the reconstruction loss. In the second case (&#x003B2; &#x0003D; 0.5) the two contributions are equally weighted. The final case (&#x003B2; &#x0003D; 10<sup>&#x02212;6</sup>) corresponds to suppressing the <italic>D</italic><sub>KL</sub> term to a negligible level.</p>
<p><xref ref-type="fig" rid="F4">Figure 4</xref> shows that all three anomaly scores underperform in the &#x003B2; &#x0003D; 10<sup>&#x02212;6</sup> case. The best performing models overall are the &#x003B2; &#x0003D; 1 and &#x003B2; &#x0003D; 0.5 cases. Comparing across the three different anomaly scores, we see that the &#x003B2; &#x0003D; 1 model that uses <italic>D</italic><sub>KL</sub> and <italic>R</italic><sub><italic>z</italic></sub> metrics, as well as the &#x003B2; &#x0003D; 0.5 model that uses the reconstruction metric perform the best. All three cases also show very similar performance across all comparison metrics as well as methods, implying that either model-anomaly score combination is equally suitable. We also find that the &#x003B2; &#x0003D; 1 <italic>D</italic><sub>KL</sub> score and the &#x003B2; &#x0003D; 0.5 reconstruction score show a similar correlation pattern on signal and background. As a result, we expect that only a limited improvement would be obtained by combining the two, which spares us the cost of introducing a new hyperparameter (the relative weight of the two terms) whose optimal value would be signal-specific, as in the case of Caron et al. (<xref ref-type="bibr" rid="B14">2021</xref>).</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Comparison of anomaly detection performance from different anomaly score definitions, applied to the GCN-VAE.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0004.tif"/>
</fig></sec>
<sec>
<title>4.5. Baseline Discrimination</title>
<p>As a result of the tests presented so far, the baseline VAE model is established as a GCN-VAE taking as input the whole set of reconstructed physics objects but no domain-specific high-level features. The Chamfer loss function is used as the anomaly score. The GCN-VAE is trained and tested only with data available within a given channel and the dataset sizes per channel are described in <xref ref-type="table" rid="T1">Table 1</xref>. <xref ref-type="fig" rid="F5">Figure 5</xref> shows the ROC curves for the baseline VAE model on benchmark signals in the four channels. It is evident that we suffer from a shortage of events for some signal models at very low &#x003F5;<sub>B</sub>. We still show ROC curves down to <inline-formula><mml:math id="M24"><mml:msub><mml:mrow><mml:mi>&#x003F5;</mml:mi></mml:mrow><mml:mrow><mml:mtext>B</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:msup><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> to allow one to compare our results to those in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>), where this range was chosen. We see an overall improvement in &#x003F5;<sub>S</sub> at very low &#x003F5;<sub>B</sub> for the GCN-VAE compared to our Conv-VAE submission in Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>).</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>ROC curves for the baseline GCN-VAE model in channel 1 (top left), channel 2a (top right), channel 2b (bottom left), and channel 3 (bottom right), computed from the &#x003F5;<sub>S</sub> and &#x003F5;<sub>B</sub> values obtained on the background sample and the benchmark signal samples. Most of the ROC curves are not smooth, due to the small dataset size for some of the channels.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0005.tif"/>
</fig></sec></sec>
<sec id="s5">
<title>5. Normalizing Flows</title>
<p>With the GCN-VAE serving as the baseline, we investigate how the use of NFs (Tabak and Vanden-Eijnden, <xref ref-type="bibr" rid="B60">2010</xref>; Tabak and Turner, <xref ref-type="bibr" rid="B59">2013</xref>) impacts the anomaly-detection performance. Normalizing flow layers are inserted between the Gaussian sampling and the decoder. They provide additional complexity to learn better posterior distributions (Rezende and Mohamed, <xref ref-type="bibr" rid="B55">2015</xref>) by morphing the multivariate prior of the latent space to a more suitable, learned function.</p>
<p>In other words, we use the NF layers to handle the fact that a VAE converging to a good output-to-input matching does not necessarily correspond to a configuration with a Gaussian prior in the latent space, <italic>p</italic>(<italic>z</italic>) &#x0003D; &#x0220F;<italic>G</italic>(<italic>z</italic>). To reach this configuration (e.g., when training a VAE as a generative model), one typically uses a &#x003B2;-VAE with an increased weighting of the <italic>D</italic><sub>KL</sub> regularizer. This typically results in a degradation of the output-to-input matching. With NFs, we learn a generic prior <italic>p</italic>(<italic>z</italic>) as <italic>f</italic>[<italic>G</italic>(<italic>z</italic>)], where <italic>f</italic> is the transformation function learned by the NF layers. This is different from the way NFs are traditionally used in VAE training, i.e., to improve the convergence of <italic>f</italic>(<italic>z</italic>) to <italic>G</italic>(<italic>z</italic>) with a stronger evidence lower bound (ELBO) condition. Because of this, we do not modify the <italic>D</italic><sub>KL</sub> term in the loss, as done in Rezende and Mohamed (<xref ref-type="bibr" rid="B55">2015</xref>). The results obtained following this more traditional training procedure are described in the <xref ref-type="supplementary-material" rid="SM1">Supplementary Material</xref>. Doing so, we observe worse &#x003F5;<sub>S</sub> for the same &#x003F5;<sub>B</sub>. This is expected because the ELBO improvement with NFs was introduced in Tomczak and Welling (<xref ref-type="bibr" rid="B61">2017</xref>) as a way to improve the VAE generative properties, and it does not imply a better anomaly detection capability.</p>
<p>An NF can be generalized as any invertible, diffeomorphic transformation that can be applied to a given distribution to produce tractable distributions (Kobyzev et al., <xref ref-type="bibr" rid="B47">2020</xref>; Papamakarios et al., <xref ref-type="bibr" rid="B51">2021</xref>). In order to be compatible with variational inference, it is desirable for the transformations to have an efficient mechanism for computing the determinant of the Jacobian, while being invertible (Rezende and Mohamed, <xref ref-type="bibr" rid="B55">2015</xref>). The NFs are trained sequentially, together with the baseline VAE model.</p>
<p>We utilize four major families of flow models:</p>
<list list-type="bullet">
<list-item><p><bold>Planar flows</bold> are invertible transformations whose Jacobian determinant can be computed rather efficiently, making them suitable candidates for variational inference (Rezende and Mohamed, <xref ref-type="bibr" rid="B55">2015</xref>). PF transformations are defined as:</p>
<p><disp-formula id="E5"><label>(5)</label><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">u</mml:mtext><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mtext class="textrm" mathvariant="normal">w</mml:mtext></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msup><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>where u, w&#x02208;&#x0211D;<sup><italic>D</italic></sup>, <italic>b</italic>&#x02208;&#x0211D; and <italic>h</italic> is a suitable smooth activation function.</p></list-item>
<list-item><p><bold>Sylvester normalizing flows</bold> (SNFs) (Berg et al., <xref ref-type="bibr" rid="B10">2018</xref>) build on the planar flow formulation and extend it to be analogous to a multilayer perceptron with one hidden layer of <italic>M</italic> units and a residual connection as:</p>
<p><disp-formula id="E6"><label>(6)</label><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>B</mml:mtext></mml:mstyle><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mi>b</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>where A&#x02208;&#x0211D;<sup><italic>D</italic> &#x000D7; <italic>M</italic></sup>, B&#x02208;&#x0211D;<sup><italic>M</italic> &#x000D7; <italic>D</italic></sup>, <italic>b</italic>&#x02208;&#x0211D;<sup><italic>M</italic></sup> and <italic>M</italic> &#x02264; <italic>D</italic>. Computing the Jacobian determinant for such a formulation is made more efficient by utilizing the Sylvester determinant identity (Berg et al., <xref ref-type="bibr" rid="B10">2018</xref>). Depending on the way <italic>A</italic> and <italic>B</italic> are parameterized, we get different types of SNFs. In this article, we use orthogonal, Householder, and triangular SNFs, as described in Berg et al. (<xref ref-type="bibr" rid="B10">2018</xref>).</p></list-item>
<list-item><p><bold>Inverse autoregressive flows</bold> (IAFs) (Kingma et al., <xref ref-type="bibr" rid="B43">2016</xref>) are computation-efficient normalizing flows based on autoregressive models. Autoregressive transformations are invertible, making them suitable candidates for our case. However, computing the transformation requires multiple sequential steps (Berg et al., <xref ref-type="bibr" rid="B10">2018</xref>). The inverse transformation however, leads to certain simplifications as described in Berg et al. (<xref ref-type="bibr" rid="B10">2018</xref>), allowing more efficient parallel computing, thereby making it a more desirable transformation for our case. We use the IAFs formulated as:</p>
<p><disp-formula id="E7"><label>(7)</label><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>:</mml:mo><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mtext>&#x02003;</mml:mtext><mml:mo>,</mml:mo><mml:mtext>&#x02003;</mml:mtext><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>D</mml:mi><mml:mtext>&#x000A0;</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>Such a formulation allows one to stack multiple transformations to achieve more flexibility in producing target distributions.</p></list-item>
<list-item><p><bold>Convolutional normalizing flows</bold> (ConvolutionalFlows) (Zheng et al., <xref ref-type="bibr" rid="B65">2018</xref>) are an extension of single-hidden-unit planar flows (Kingma et al., <xref ref-type="bibr" rid="B43">2016</xref>) to the case of multiple hidden units, further enhanced by replacing the fully connected network operation with a one-dimensional (1D) convolution, to achieve bijectivity. They are defined by the following transformation:</p>
<p><disp-formula id="E8"><label>(8)</label><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="right center left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>u</mml:mtext></mml:mstyle><mml:mo>&#x02299;</mml:mo><mml:mi>h</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">conv</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>w</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mtext>&#x000A0;&#x000A0;</mml:mtext><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p>where <italic>w</italic>&#x02208;<italic>R</italic><sup><italic>k</italic></sup> is the parameter of the 1D convolution filter with <italic>k</italic>-sized kernel, <italic>h</italic> is a monotonic nonlinear activation function and &#x02299; denotes pointwise multiplication.</p></list-item>
<list-item><p><bold>Autoregressive neural spline flows</bold> (NSFARs) (Durkan et al., <xref ref-type="bibr" rid="B28">2019</xref>) are similar to IAFs, where affine transforms are replaced by monotonic rational-quadratic spline transforms as described in Durkan et al. (<xref ref-type="bibr" rid="B28">2019</xref>). They resemble a traditional feed-forward neural network architecture, alternating between linear transformations and elementwise non-linearities, while retaining an exact, analytic inverse.</p></list-item>
</list>
<p>The hyperparameters for each normalizing flow architecture are chosen arbitrarily to avoid overtuning on the available dataset as learned from Aarrestad et al. (<xref ref-type="bibr" rid="B5">2021</xref>). The planar flow model consists of a stack of six flows, each made of three dense layers with 90 neurons each. SNFs are defined by stacking six flows with eight orthogonal, householder, and triangular transformations for each of the respective types of SNF. IAFs are constructed with four masked autoencoder for distribution estimation (MADE) (Germain et al., <xref ref-type="bibr" rid="B32">2015</xref>) layers as described in Kingma et al. (<xref ref-type="bibr" rid="B43">2016</xref>), each containing 330 neurons. ConvolutionalFlows include four flow layers with kernel size <italic>k</italic> &#x0003D; 7 and apply kernel dilation as described in Zheng et al. (<xref ref-type="bibr" rid="B65">2018</xref>). NSFARs are defined by stacking four flow layers each with <italic>K</italic> &#x0003D; 64 bins and eight hidden features.</p>
<p><xref ref-type="fig" rid="F6">Figure 6</xref> shows the results of all GCN-VAE models combined with all the different types of flows as described in Section 5. Based on results from all data channels combined through all six strategies mentioned in Section 3, and considering variance across trainings from different random seeds (see <xref ref-type="supplementary-material" rid="SM1">Supplementary Material</xref>), it is evident that using normalizing flows improves not only the AUC metric but also the signal efficiencies at low background efficiencies. We find that the Householder variant of SNFs produces the best improvement with respect to the baseline GCN-VAE model. The exercise was also repeated with a Conv-VAE model and similar trends were observed. There, the normalizing flows showed a larger improvement from the baseline Conv-VAE than for the GCN-VAE model but the overall results are less accurate than that of GCN-VAE with normalizing flows.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Comparison of anomaly detection performance for GCN-VAE models with different normalizing flow architectures in the latent space.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0006.tif"/>
</fig>
<p><xref ref-type="fig" rid="F7">Figure 7</xref> shows the ROC curves for the best presented model, GCN-VAE_HouseholderSNF across all available signal samples in all data channels. For some of the samples, the small dataset size translates into a discontinuous curve and larger uncertainties.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>ROC curves of GCN-VAE_HouseholderSNF for all signals in each of channel 1 (top left), channel 2a (top right), channel 2b (bottom left), and channel 3 (bottom right).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-05-803685-g0007.tif"/>
</fig></sec>
<sec sec-type="conclusions" id="s6">
<title>6. Conclusions</title>
<p>We constructed a graph-based anomaly detection model to identify new physics events in the DarkMachines challenge dataset. Inspired by the outcome of this challenge, specific model design choices (data representation, use of physics-motivated high-level features, and anomaly score definition) were further optimized in order to maximize anomaly detection performance. As is the case for many other deep learning applications to particle-physics data, we observed that the graph architecture better captures the point-cloud nature of this data, resulting in an enhanced performance.</p>
<p>In this baseline, we investigate the impact of using a stack of normalizing flows in the latent space of the variational autoencoder (VAE), between the Gaussian sampling and the decoding, in order to improve the accuracy of the prior learning process, by morphing the Gaussian prior to a more suitable function, learned during the training.</p>
<p>Testing the trained model on a set of benchmark signal samples, we observe an overall improvement when normalizing flows are used, with the Householder variant of the Sylvester normalizing flow model giving the best results. With that, we reach a median anomaly identification probability of 72% (34%) for an &#x003F5;<sub>B</sub> of 1% (0.1%) across all signal samples over all available channels. The median anomaly identification probability increases to 95% (96%) for an &#x003F5;<sub>B</sub> of 30% (60%).</p>
<p>This work presents an improvement over our Conv-VAE model, submitted to the DarkMachines challenge (Aarrestad et al., <xref ref-type="bibr" rid="B5">2021</xref>).</p></sec>
<sec sec-type="data-availability" id="s7">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary Material</xref>, further inquiries can be directed to the corresponding author/s.</p></sec>
<sec id="s8">
<title>Author Contributions</title>
<p>All authors in equal share developed the baseline VAE model. All authors in equal share took part in writing and editing the manuscript. PJ and MP developed the VAE &#x0002B; normalizing flow models. All authors contributed to the article and approved the submitted version.</p></sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>PJ, TA, MP, and KW were supported by the European Research Council (ERC) under the European Union&#x00027;s Horizon 2020 research and innovation program (Grant Agreement No. 772369). JD was supported by the U.S. Department of Energy (DOE), Office of Science, Office of High Energy Physics Early Career Research program under Award No. DE-SC0021187. ST was supported by the University of California San Diego Triton Research and Experiential Learning Scholars (TRELS) program. JN was supported by Fermi Research Alliance, LLC under Contract No. DE-AC02-07CH11359 with the U.S. Department of Energy, Office of Science, Office of High Energy Physics.</p></sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec> </body>
<back>
<sec sec-type="supplementary-material" id="s11">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdata.2022.803685/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdata.2022.803685/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.PDF" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aaboud</surname> <given-names>M.</given-names></name> <name><surname>Aad</surname> <given-names>G.</given-names></name> <name><surname>Abbott</surname> <given-names>B.</given-names></name> <name><surname>Abdinov</surname> <given-names>O.</given-names></name> <name><surname>Abeloos</surname> <given-names>B.</given-names></name> <name><surname>Abidi</surname> <given-names>S. H.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>A strategy for a general search for new phenomena using data-derived signal regions and its application within the ATLAS experiment</article-title>. <source>Eur. Phys. J. C</source> <volume>79</volume>:<fpage>120</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-019-6540-y</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aad</surname> <given-names>G.</given-names></name> <name><surname>Abbott</surname> <given-names>B.</given-names></name> <name><surname>Abbott</surname> <given-names>D. C.</given-names></name> <name><surname>Abed Abud</surname> <given-names>A.</given-names></name> <name><surname>Abeling</surname> <given-names>K.</given-names></name> <name><surname>Abhayasinghe</surname> <given-names>D. K.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Operation of the ATLAS trigger system in Run 2</article-title>. <source>J. Instrum</source>. <volume>15</volume>:<fpage>P10004</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/15/10/P10004</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aaltonen</surname> <given-names>T.</given-names></name> <name><surname>Adelman</surname> <given-names>J.</given-names></name> <name><surname>Akimoto</surname> <given-names>T.</given-names></name> <name><surname>Albrow</surname> <given-names>M. G.</given-names></name> <name><surname>&#x000C1;lvarez Gonzlez</surname> <given-names>B.</given-names></name> <name><surname>Amerio</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>Global search for new physics with 2.0 fb<sup>&#x02212;1</sup> at CDF</article-title>. <source>Phys. Rev. D</source> <volume>79</volume>:<fpage>011101</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.79.011101</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aaron</surname> <given-names>F. D.</given-names></name> <name><surname>Alexa</surname> <given-names>C.</given-names></name> <name><surname>Andreev</surname> <given-names>V.</given-names></name> <name><surname>Antunovic</surname> <given-names>B.</given-names></name> <name><surname>Aplin</surname> <given-names>S.</given-names></name> <name><surname>Asmone</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2009</year>). <article-title>A general search for new phenomena at HERA</article-title>. <source>Phys. Lett. B</source> <volume>674</volume>, <fpage>257</fpage>&#x02013;<lpage>268</lpage>. <pub-id pub-id-type="doi">10.1016/j.physletb.2009.03.034</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aarrestad</surname> <given-names>T.</given-names></name> <name><surname>van Beekveld</surname> <given-names>M.</given-names></name> <name><surname>Bona</surname> <given-names>M.</given-names></name> <name><surname>Boveia</surname> <given-names>A.</given-names></name> <name><surname>Caron</surname> <given-names>S.</given-names></name> <name><surname>Davies</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>The dark machines anomaly score challenge: benchmark data and model independent event classification for the large hadron collider</article-title>. <source>Sci. Post Phys</source>. arXiv [Preprint]. arXiv: 2105.14027.</citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Amram</surname> <given-names>O.</given-names></name> <name><surname>Suarez</surname> <given-names>C. M.</given-names></name></person-group> (<year>2021</year>). <article-title>Tag N&#x00027; Train: a technique to train improved classifiers on unlabeled data</article-title>. <source>J. High Energ. Phys</source>. <volume>1</volume>:<fpage>153</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP01(2021)153</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>An</surname> <given-names>J.</given-names></name> <name><surname>Cho</surname> <given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Variational autoencoder based anomaly detection using reconstruction probability</article-title>. <source>Spec. Lect. IE</source> <volume>2</volume>:<fpage>1</fpage>.</citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Andreassen</surname> <given-names>A.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Shih</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Simulation assisted likelihood-free anomaly detection</article-title>. <source>Phys. Rev. D</source> <volume>101</volume>:<fpage>95004</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.101.095004</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Barrow</surname> <given-names>H. G.</given-names></name> <name><surname>Tenenbaum</surname> <given-names>J. M.</given-names></name> <name><surname>Bolles</surname> <given-names>R. C.</given-names></name> <name><surname>Wolf</surname> <given-names>H. C.</given-names></name></person-group> (<year>1977</year>). <article-title>&#x0201C;Parametric correspondence and Chamfer matching: two new techniques for image matching,&#x0201D;</article-title> in <source>Proceedings of the 5th International Joint Conference on Artificial Intelligence (KJCAI), Vol. 2</source> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>Morgan Kaufmann Publishers Inc.</publisher-name>), <fpage>659</fpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Berg</surname> <given-names>R. V. d.</given-names></name> <name><surname>Hasenclever</surname> <given-names>L.</given-names></name> <name><surname>Tomczak</surname> <given-names>J. M.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Sylvester normalizing flows for variational inference,&#x0201D;</article-title> in <source>Conference on Uncertainty in Artificial Intelligence (UAI) 2018</source> (<publisher-loc>Monterey, CA</publisher-loc>: <publisher-name>UAI</publisher-name>). Available online at: <ext-link ext-link-type="uri" xlink:href="http://auai.org/uai2018/proceedings/papers/156.pdf">http://auai.org/uai2018/proceedings/papers/156.pdf</ext-link></citation>
</ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Blance</surname> <given-names>A.</given-names></name> <name><surname>Spannowsky</surname> <given-names>M.</given-names></name> <name><surname>Waite</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Adversarially-trained autoencoders for robust unsupervised new physics searches</article-title>. <source>J. High Energ. Phys</source>. <volume>10</volume>:<fpage>047</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP10(2019)047</pub-id></citation>
</ref>
<ref id="B12">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Bortolato</surname> <given-names>B.</given-names></name> <name><surname>Dillon</surname> <given-names>B.</given-names></name> <name><surname>Kamenik</surname> <given-names>J. F.</given-names></name> <name><surname>Smolkovic</surname> <given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>Bump hunting in space</article-title>. <source>arXiv [Preprint]</source>. arXiv: 2103.06595. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2103.06595.pdf">https://arxiv.org/pdf/2103.06595.pdf</ext-link> (accessed March 11, 2021).</citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bronstein</surname> <given-names>M. M.</given-names></name> <name><surname>Bruna</surname> <given-names>J.</given-names></name> <name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Szlam</surname> <given-names>A.</given-names></name> <name><surname>Vandergheynst</surname> <given-names>P.</given-names></name></person-group> (<year>2017</year>). <article-title>Geometric deep learning: going beyond Euclidean data</article-title>. <source>IEEE Signal Process. Mag</source>. <volume>34</volume>:<fpage>18</fpage>. <pub-id pub-id-type="doi">10.1109/MSP.2017.2693418</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Caron</surname> <given-names>S.</given-names></name> <name><surname>Hendriks</surname> <given-names>L.</given-names></name> <name><surname>Verheyen</surname> <given-names>R.</given-names></name></person-group> (<year>2021</year>). <article-title>Rare and different: Anomaly scores from a combination of likelihood and out-of-distribution models to detect new physics at the LHC</article-title>. <source>arXiv [Preprint]</source>. arXiv: 2106.10164. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2106.10164.pdf">https://arxiv.org/pdf/2106.10164.pdf</ext-link> (accessed December 22, 2021).</citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cerri</surname> <given-names>O.</given-names></name> <name><surname>Nguyen</surname> <given-names>T. Q.</given-names></name> <name><surname>Pierini</surname> <given-names>M.</given-names></name> <name><surname>Spiropulu</surname> <given-names>M.</given-names></name> <name><surname>Vlimant</surname> <given-names>J.-R.</given-names></name></person-group> (<year>2019</year>). <article-title>Variational autoencoders for new physics mining at the large hadron collider</article-title>. <source>J. High Energ. Phys</source>. <volume>5</volume>:<fpage>36</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP05(2019)036</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Cheng</surname> <given-names>T.</given-names></name> <name><surname>Arguin</surname> <given-names>J. -F.</given-names></name> <name><surname>Leissner-Martin</surname> <given-names>J.</given-names></name> <name><surname>Pilette</surname> <given-names>J.</given-names></name> <name><surname>Golling</surname> <given-names>T.</given-names></name></person-group> (<year>2020</year>). <source>Variational autoencoders for anomalous jet tagging</source>. arXiv [Preprint]. arXiv: 2007.01850. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2007.01850.pdf">https://arxiv.org/pdf/2007.01850.pdf</ext-link> (accessed February 15, 2021).</citation>
</ref>
<ref id="B17">
<citation citation-type="web"><person-group person-group-type="author"><collab>CMS-PAS-EXO-14-016</collab></person-group> (<year>2017</year>). <article-title>&#x0201C;MUSiC, a model unspecific search for new physics,&#x0201D;</article-title> in <source><italic>pp</italic> Collisions at &#x0221A;<italic>s</italic> &#x0003D; 8 TeV</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://cds.cern.ch/record/2256653">https://cds.cern.ch/record/2256653</ext-link> CMS-PAS-EXO-14-016</citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname> <given-names>J. H.</given-names></name> <name><surname>Howe</surname> <given-names>K.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name></person-group> (<year>2018</year>). <article-title>Anomaly detection for resonant new physics with machine learning</article-title>. <source>Phys. Rev. Lett</source>. <volume>121</volume>:<fpage>241803</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevLett.121.241803</pub-id><pub-id pub-id-type="pmid">30608762</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname> <given-names>J. H.</given-names></name> <name><surname>Howe</surname> <given-names>K.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name></person-group> (<year>2019</year>). <article-title>Extending the search for new resonances with machine learning</article-title>. <source>Phys. Rev. D</source> <volume>99</volume>:<fpage>014038</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.99.014038</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Collins</surname> <given-names>J. H.</given-names></name> <name><surname>Mart&#x000ED;n-Ramiro</surname> <given-names>P.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Shih</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>Comparing weak- and unsupervised methods for resonant anomaly detection</article-title>. <source>Eur. Phys. J. C</source> <volume>81</volume>:<fpage>617</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-021-09389-x</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><collab>D0 Collaboration</collab></person-group> (<year>2012</year>). <article-title>Model independent search for new phenomena in p<inline-formula><mml:math id="M29"><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>&#x00304;</mml:mo></mml:mover></mml:math></inline-formula> collisions at <inline-formula><mml:math id="M30"><mml:msqrt><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msqrt><mml:mo>=</mml:mo><mml:mn>1.96</mml:mn><mml:mtext>&#x000A0;TeV</mml:mtext></mml:math></inline-formula></article-title>. <source>Phys. Rev. D</source> <volume>85</volume>:<fpage>092015</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.85.092015</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>D&#x00027;Agnolo</surname> <given-names>R. T.</given-names></name> <name><surname>Grosso</surname> <given-names>G.</given-names></name> <name><surname>Pierini</surname> <given-names>M.</given-names></name> <name><surname>Wulzer</surname> <given-names>A.</given-names></name> <name><surname>Zanetti</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <article-title>Learning multivariate new physics</article-title>. <source>Eur. Phys. J. C</source> <volume>81</volume>:<fpage>89</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-021-08853-y</pub-id></citation>
</ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>D&#x00027;Agnolo</surname> <given-names>R. T.</given-names></name> <name><surname>Wulzer</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <article-title>Learning new physics from a machine</article-title>. <source>Phys. Rev. D</source> <volume>99</volume>:<fpage>015014</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.99.015014</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><collab>DarkMachines Community</collab></person-group> (<year>2020</year>). <source>Unsupervised-Hackathon</source>. <publisher-name>DarkMachines Community</publisher-name>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>De Simone</surname> <given-names>A.</given-names></name> <name><surname>Jacques</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>Guiding new physics searches with unsupervised learning</article-title>. <source>Eur. Phys. J. C</source> <volume>79</volume>:<fpage>289</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-019-6787-3</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dillon</surname> <given-names>B. M.</given-names></name> <name><surname>Faroughy</surname> <given-names>D. A.</given-names></name> <name><surname>Kamenik</surname> <given-names>J. F.</given-names></name> <name><surname>Szewc</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Learning the latent structure of collider events</article-title>. <source>J. High Energ. Phys</source>. <volume>10</volume>:<fpage>206</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP10(2020)206</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Duarte</surname> <given-names>J.</given-names></name> <name><surname>Vilmant</surname> <given-names>J. -R.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Graph neural networks for particle tracking and reconstruction,&#x0201D;</article-title> in <source>Artificial Intelligence for High Energy Physics</source>, eds P. Calafiura, D. Rousseau, and K. Terao (<publisher-loc>Hackensack, NJ</publisher-loc>: <publisher-name>World Scientific Publishing</publisher-name>). <pub-id pub-id-type="doi">10.1142/12200</pub-id><pub-id pub-id-type="pmid">33791596</pub-id></citation></ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Durkan</surname> <given-names>C.</given-names></name> <name><surname>Bekasov</surname> <given-names>A.</given-names></name> <name><surname>Murray</surname> <given-names>I.</given-names></name> <name><surname>Papamakarios</surname> <given-names>G.</given-names></name></person-group> (<year>2019</year>). <article-title>Neural spline flows</article-title>. <source>Adv. Neural Inform. Process. Syst</source>. <volume>32</volume>, <fpage>7511</fpage>&#x02013;<lpage>7522</lpage>.</citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>H.</given-names></name> <name><surname>Su</surname> <given-names>H.</given-names></name> <name><surname>Guibas</surname> <given-names>L. J.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;A point set generation network for 3D object reconstruction from a single image,&#x0201D;</article-title> in <source>2017 IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Honolulu, HA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2463</fpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2017.264</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Farina</surname> <given-names>M.</given-names></name> <name><surname>Nakai</surname> <given-names>Y.</given-names></name> <name><surname>Shih</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Searching for new physics with deep autoencoders</article-title>. <source>Phys. Rev. D</source> <volume>101</volume>:<fpage>075021</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.101.075021</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Finke</surname> <given-names>T.</given-names></name> <name><surname>Kr&#x000E4;mer</surname> <given-names>M.</given-names></name> <name><surname>Morandini</surname> <given-names>A.</given-names></name> <name><surname>M&#x000FC;ck</surname> <given-names>A.</given-names></name> <name><surname>Oleksiyuk</surname> <given-names>I.</given-names></name></person-group> (<year>2021</year>). <article-title>Autoencoders for unsupervised anomaly detection in high energy physics</article-title>. <source>J. High Energ. Phys</source>. <volume>6</volume>:<fpage>161</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP06(2021)161</pub-id><pub-id pub-id-type="pmid">34736231</pub-id></citation></ref>
<ref id="B32">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Germain</surname> <given-names>M.</given-names></name> <name><surname>Gregor</surname> <given-names>K.</given-names></name> <name><surname>Murray</surname> <given-names>I.</given-names></name> <name><surname>Larochelle</surname> <given-names>H.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;MADE: masked autoencoder for distribution estimation,&#x0201D;</article-title> in <source>Proceedings of the 32nd International Conference on Machine Learning, Vol. 37 of Proceedings of Machine Learning Research</source>, eds F. Bach and D. Blei (<publisher-loc>Lille</publisher-loc>), <fpage>881</fpage>.<pub-id pub-id-type="pmid">33048754</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Gonski</surname> <given-names>J.</given-names></name> <name><surname>Lai</surname> <given-names>J.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Ochoa</surname> <given-names>I.</given-names></name></person-group> (<year>2021</year>). <article-title>High-dimensional anomaly detection with radiative return in <italic>e</italic><sup>&#x0002B;</sup><italic>e</italic><sup>&#x02212;</sup> collisions</article-title>. <source>arXiv [Preprint]</source>. arXiv: 2108.13451. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2108.13451.pdf">https://arxiv.org/pdf/2108.13451.pdf</ext-link> (accessed February 08, 2022).</citation>
</ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gross</surname> <given-names>E.</given-names></name> <name><surname>Vitells</surname> <given-names>O.</given-names></name></person-group> (<year>2010</year>). <article-title>Trial factors for the look elsewhere effect in high energy physics</article-title>. <source>Eur. Phys. J. C</source> <volume>70</volume>:<fpage>525</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-010-1470-8</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hajer</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>Y.-Y.</given-names></name> <name><surname>Liu</surname> <given-names>T.</given-names></name> <name><surname>Wang</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>Novelty detection meets collider physics</article-title>. <source>Phys. Rev. D</source> <volume>101</volume>:<fpage>076015</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.101.076015</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Hallin</surname> <given-names>A.</given-names></name> <name><surname>Isaacson</surname> <given-names>J.</given-names></name> <name><surname>Kasieczka</surname> <given-names>G.</given-names></name> <name><surname>Krause</surname> <given-names>C.</given-names></name> <name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Quadfasel</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Classifying anomalies through outer density estimation (CATHODE)</article-title>. <source>arXiv [Preprint]</source>. arXiv: 2109.00546. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2109.00546.pdf">https://arxiv.org/pdf/2109.00546.pdf</ext-link> (accessed October 29, 2021).</citation>
</ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Heimel</surname> <given-names>T.</given-names></name> <name><surname>Kasieczka</surname> <given-names>G.</given-names></name> <name><surname>Plehn</surname> <given-names>T.</given-names></name> <name><surname>Thompson</surname> <given-names>J. M.</given-names></name></person-group> (<year>2019</year>). <article-title>QCD or what?</article-title> <source>Sci. Post Phys</source>. <volume>6</volume>:<fpage>30</fpage>. <pub-id pub-id-type="doi">10.21468/SciPostPhys.6.3.030</pub-id></citation>
</ref>
<ref id="B38">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Higgins</surname> <given-names>I.</given-names></name> <name><surname>Matthey</surname> <given-names>L.</given-names></name> <name><surname>Pal</surname> <given-names>A.</given-names></name> <name><surname>Burgess</surname> <given-names>C. P.</given-names></name> <name><surname>Glorot</surname> <given-names>X.</given-names></name> <name><surname>Botvinick</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Beta-VAE: Learning basic visual concepts with a constrained variational framework,&#x0201D;</article-title> in <source>5th International Conference on Learning Representations</source> (<publisher-loc>Toulon</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=Sy2fzU9gl">https://openreview.net/forum?id=Sy2fzU9gl</ext-link></citation>
</ref>
<ref id="B39">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Jawahar</surname> <given-names>P.</given-names></name> <name><surname>Pierini</surname> <given-names>M.</given-names></name></person-group> (<year>2021</year>). <source>mpp-hep/DarkFlow repository</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/mpp-hep/DarkFlow">https://github.com/mpp-hep/DarkFlow</ext-link></citation>
</ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kasieczka</surname> <given-names>G</given-names></name></person-group>. (<year>2021</year>). <article-title>The LHC olympics 2020: a community challenge for anomaly detection in high energy physics</article-title>. <source>Rep. Prog. Phys</source>. <volume>84</volume>:<fpage>124201</fpage>. <pub-id pub-id-type="doi">10.1088/1361-6633/ac36b9</pub-id><pub-id pub-id-type="pmid">34736231</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Khosa</surname> <given-names>C. K.</given-names></name> <name><surname>Sanz</surname> <given-names>V.</given-names></name></person-group> (<year>2020</year>). <article-title>Anomaly awareness</article-title>.</citation>
</ref>
<ref id="B42">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Adam: A method for stochastic optimization,&#x0201D;</article-title> in <source>3rd International Conference for Learning Representations</source> (<publisher-loc>San Diego, CA</publisher-loc>). arXiv [Preprint]. arXiv: 1412.6980.</citation>
</ref>
<ref id="B43">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Salimans</surname> <given-names>T.</given-names></name> <name><surname>Jozefowicz</surname> <given-names>R.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Improving variational inference with inverse autoregressive flow,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems</source>, <volume>Vol. 29</volume>, eds D. Lee, M. Sugiyama, U. Luxburg, I. Guyon, and R. Garnett (<publisher-loc>Barcelona</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>). arXiv [Preprint]. arXiv: 1606.04934. Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2016/file/ddeebdeefdb7e7e7a697e1c3e3d8ef54-Paper.pdf">https://proceedings.neurips.cc/paper/2016/file/ddeebdeefdb7e7e7a697e1c3e3d8ef54-Paper.pdf</ext-link></citation>
</ref>
<ref id="B44">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Auto-encoding variational Bayes,&#x0201D;</article-title> in <source>2nd International Conference on Learning Representations, ICLR 2014</source> (<publisher-loc>Banff, AB</publisher-loc>).<pub-id pub-id-type="pmid">32176273</pub-id></citation></ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>An introduction to variational autoencoders</article-title>. <source>Found. Trends Mach. Learn</source>. <volume>12</volume>:<fpage>307</fpage>. <pub-id pub-id-type="doi">10.1561/9781680836233</pub-id><pub-id pub-id-type="pmid">34918594</pub-id></citation></ref>
<ref id="B46">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Kipf</surname> <given-names>T. N.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Semi-supervised classification with graph convolutional networks,&#x0201D;</article-title> in <source>5th International Conference on Learning Representations</source> (<publisher-loc>Toulon</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=SJU4ayYgl">https://openreview.net/forum?id=SJU4ayYgl</ext-link></citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kobyzev</surname> <given-names>I.</given-names></name> <name><surname>Prince</surname> <given-names>S.</given-names></name> <name><surname>Brubaker</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Normalizing flows: an introduction and review of current methods</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell</source>. <volume>43</volume>, <fpage>3964</fpage>&#x02013;<lpage>3979</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2020.2992934</pub-id><pub-id pub-id-type="pmid">32396070</pub-id></citation></ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Shih</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Anomaly detection with density estimation</article-title>. <source>Phys. Rev. D</source>. <volume>101</volume>, <fpage>075042</fpage>.</citation>
</ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nachman</surname> <given-names>B.</given-names></name> <name><surname>Shih</surname> <given-names>D.</given-names></name></person-group> (<year>2020</year>). <article-title>Anomaly detection with density estimation</article-title>. <source>Phys. Rev. D</source> <volume>101</volume>:<fpage>075042</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.101.075042</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ostdiek</surname> <given-names>B</given-names></name></person-group>. (<year>2021</year>). <article-title>Deep set auto encoders for anomaly detection in particle physics</article-title>. <source>arXiv [Preprint]</source>. arXiv: 2109.01695. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/2109.01695.pdf">https://arxiv.org/pdf/2109.01695.pdf</ext-link> (accessed November 15, 2021).</citation>
</ref>
<ref id="B51">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Papamakarios</surname> <given-names>G.</given-names></name> <name><surname>Nalisnick</surname> <given-names>E.</given-names></name> <name><surname>Rezende</surname> <given-names>D. J.</given-names></name> <name><surname>Mohamed</surname> <given-names>S.</given-names></name> <name><surname>Lakshminarayanan</surname> <given-names>B.</given-names></name></person-group> (<year>2021</year>). <article-title>Normalizing flows for probabilistic modeling and inference</article-title>. <source>J. Mach. Learn. Res</source>. <volume>22</volume>, <fpage>1</fpage>&#x02013;<lpage>64</lpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://jmlr.org/papers/v22/19-1028.html">http://jmlr.org/papers/v22/19-1028.html</ext-link><pub-id pub-id-type="pmid">32200210</pub-id></citation></ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>S. E.</given-names></name> <name><surname>Rankin</surname> <given-names>D.</given-names></name> <name><surname>Udrescu</surname> <given-names>S.-M.</given-names></name> <name><surname>Yunus</surname> <given-names>M.</given-names></name> <name><surname>Harris</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>Quasi Anomalous Knowledge: Searching for new physics with embedded knowledge</article-title>. <source>J. High Energ. Phys</source>. <volume>21</volume>:<fpage>30</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP06(2021)030</pub-id></citation>
</ref>
<ref id="B53">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Paszke</surname> <given-names>A.</given-names></name> <name><surname>Gross</surname> <given-names>S.</given-names></name> <name><surname>Massa</surname> <given-names>F.</given-names></name> <name><surname>Lerer</surname> <given-names>A.</given-names></name> <name><surname>Bradbury</surname> <given-names>J.</given-names></name> <name><surname>Chanan</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;PyTorch: an imperative style, high-performance deep learning library,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Vol. 32</source>, eds H. Wallach, H. Larochelle, A. Beygelzimer, F. d&#x00027;Alch&#x000E9;-Buc, E. Fox, and R. Garnett (<publisher-loc>Vancouver, BC</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf">https://proceedings.neurips.cc/paper/2019/file/bdbca288fee7f92f2bfa9f7012727740-Paper.pdf</ext-link></citation>
</ref>
<ref id="B54">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pedregosa</surname> <given-names>F.</given-names></name> <name><surname>Varoquaux</surname> <given-names>G.</given-names></name> <name><surname>Gramfort</surname> <given-names>A.</given-names></name> <name><surname>Michel</surname> <given-names>V.</given-names></name> <name><surname>Thirion</surname> <given-names>B.</given-names></name> <name><surname>Grisel</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res</source>. <volume>12</volume>:<fpage>2825</fpage>.</citation>
</ref>
<ref id="B55">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rezende</surname> <given-names>D.</given-names></name> <name><surname>Mohamed</surname> <given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Variational inference with normalizing flows,&#x0201D;</article-title> in <source>Proceedings of the 32nd International Conference on Machine Learning, Vol. 37</source>, eds F. Bach and D. Blei (<publisher-loc>Lille</publisher-loc>), <fpage>1530</fpage>.<pub-id pub-id-type="pmid">32200210</pub-id></citation></ref>
<ref id="B56">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rezende</surname> <given-names>D. J.</given-names></name> <name><surname>Mohamed</surname> <given-names>S.</given-names></name> <name><surname>Wierstra</surname> <given-names>D.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Stochastic backpropagation and approximate inference in deep generative models,&#x0201D;</article-title> in <source>Proceedings of the 31st International Conference on Machine Learning, Vol. 32 of Proceedings of Machine Learning Research</source> (<publisher-loc>Beijing</publisher-loc>), <fpage>1278</fpage>.</citation>
</ref>
<ref id="B57">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shlomi</surname> <given-names>J.</given-names></name> <name><surname>Battaglia</surname> <given-names>P.</given-names></name> <name><surname>Vlimant</surname> <given-names>J.-R.</given-names></name></person-group> (<year>2020</year>). <article-title>Graph neural networks in particle physics</article-title>. <source>Mach. Learn. Sci. Tech</source>. <volume>2</volume>:<fpage>21001</fpage>. <pub-id pub-id-type="doi">10.1088/2632-2153/abbf9a</pub-id><pub-id pub-id-type="pmid">33791596</pub-id></citation></ref>
<ref id="B58">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sirunyan</surname> <given-names>A. M.</given-names></name> <name><surname>Tumasyan</surname> <given-names>A.</given-names></name> <name><surname>Adam</surname> <given-names>W.</given-names></name> <name><surname>Ambrogi</surname> <given-names>F.</given-names></name> <name><surname>Arnold</surname> <given-names>B.</given-names></name> <name><surname>Bergauer</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Performance of the CMS Level-1 trigger in proton-proton collisions at <inline-formula><mml:math id="M31"><mml:msqrt><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msqrt></mml:math></inline-formula> = 13 TeV</article-title>. <source>J. Instrum</source>. <volume>15</volume>:<fpage>P10017</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/15/10/P10017</pub-id></citation>
</ref>
<ref id="B59">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tabak</surname> <given-names>E. G.</given-names></name> <name><surname>Turner</surname> <given-names>C. V.</given-names></name></person-group> (<year>2013</year>). <article-title>A family of nonparametric density estimation algorithms</article-title>. <source>Commun. Pure Appl. Math</source>. <volume>66</volume>:<fpage>145</fpage>. <pub-id pub-id-type="doi">10.1002/cpa.21423</pub-id></citation>
</ref>
<ref id="B60">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tabak</surname> <given-names>E. G.</given-names></name> <name><surname>Vanden-Eijnden</surname> <given-names>E.</given-names></name></person-group> (<year>2010</year>). <article-title>Density estimation by dual ascent of the log-likelihood</article-title>. <source>Commun. Math. Sci</source>. <volume>8</volume>:<fpage>217</fpage>. <pub-id pub-id-type="doi">10.4310/CMS.2010.v8.n1.a11</pub-id></citation>
</ref>
<ref id="B61">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Tomczak</surname> <given-names>J. M.</given-names></name> <name><surname>Welling</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Improving variational auto-encoders using convex combination linear inverse autoregressive flow,&#x0201D;</article-title> in <source>Benelearn 2017</source> (<publisher-loc>Eindhoven</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="http://wwwis.win.tue.nl/&#x0007E;benelearn2017/">http://wwwis.win.tue.nl/&#x0007E;benelearn2017/</ext-link></citation>
</ref>
<ref id="B62">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Trocino</surname> <given-names>D</given-names></name></person-group>. (<year>2014</year>). <article-title>The CMS high level trigger</article-title>. <source>J. Phys. Conf. Ser</source>. <volume>513</volume>:<fpage>012036</fpage>. <pub-id pub-id-type="doi">10.1088/1742-6596/513/1/012036</pub-id></citation>
</ref>
<ref id="B63">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Weisser</surname> <given-names>C.</given-names></name> <name><surname>Williams</surname> <given-names>M.</given-names></name></person-group> (<year>2016</year>). <article-title>Machine learning and multivariate goodness of fit</article-title>. <source>arXiv [Preprint]</source>. arXiv: 1612.07186. Available online at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/pdf/1612.07186.pdf">https://arxiv.org/pdf/1612.07186.pdf</ext-link> (accessed December 20, 2016).</citation>
</ref>
<ref id="B64">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Hare</surname> <given-names>J.</given-names></name> <name><surname>Pr&#x000FC;gel-Bennett</surname> <given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;FSPool: Learning set representations with featurewise sort pooling,&#x0201D;</article-title> in <source>8th International Conference on Learning Representations</source> (<publisher-loc>Addis Ababa</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=HJgBA2VYwH">https://openreview.net/forum?id=HJgBA2VYwH</ext-link></citation>
</ref>
<ref id="B65">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Zheng</surname> <given-names>G.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Carbonell</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Convolutional normalizing flows,&#x0201D;</article-title> in <source>ICML 2018 Workshop on Theoretical Foundations and Applications of Deep Generative Models</source> (<publisher-loc>Stockholm</publisher-loc>). Available online at: <ext-link ext-link-type="uri" xlink:href="https://drive.google.com/file/d/1_40TktDTeKG2eEpbE-D_hQcb3oSj5ebQ/view">https://drive.google.com/file/d/1_40TktDTeKG2eEpbE-D_hQcb3oSj5ebQ/view</ext-link></citation>
</ref>
<ref id="B66">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>J.</given-names></name> <name><surname>Cui</surname> <given-names>G.</given-names></name> <name><surname>Hu</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Graph neural networks: a review of methods and applications</article-title>. <source>AI Open</source> <volume>1</volume>:<fpage>57</fpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2021.01.001</pub-id></citation>
</ref>
</ref-list>
<fn-group>
<fn id="fn0001"><p><sup>1</sup>We use a Cartesian coordinate system with the <italic>z</italic> axis oriented along the beam axis, the <italic>x</italic> axis on the horizontal plane, and the <italic>y</italic> axis oriented upward. The <italic>x</italic> and <italic>y</italic> axes define the transverse plane, while the <italic>z</italic> axis identifies the longitudinal direction. The azimuth angle &#x003D5; is computed with respect to the <italic>x</italic> axis. The polar angle &#x003B8; is used to compute the pseudorapidity &#x003B7; &#x0003D; &#x02212;log[tan(&#x003B8;/2)]. The transverse momentum (<italic>p</italic><sub>T</sub>) is the projection of the particle momentum on the (<italic>x</italic>, <italic>y</italic>) plane. We fix units such that <italic>c</italic> &#x0003D; &#x0210F; &#x0003D; 1.</p></fn>
<fn id="fn0002"><p><sup>2</sup><ext-link ext-link-type="uri" xlink:href="https://arc.wpi.edu/computing/hpc-clusters/">https://arc.wpi.edu/computing/hpc-clusters/</ext-link></p></fn>
</fn-group>
</back>
</article> 