<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">598927</article-id>
<article-id pub-id-type="doi">10.3389/fdata.2020.598927</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Distance-Weighted Graph Neural Networks on FPGAs for Real-Time Particle Reconstruction in High Energy Physics</article-title>
<alt-title alt-title-type="left-running-head">Iiyama et&#x0020;al.</alt-title>
<alt-title alt-title-type="right-running-head">Graph Neural Networks on FPGAs</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Iiyama</surname>
<given-names>Yutaro</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x002a;</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1062216/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cerminara</surname>
<given-names>Gianluca</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gupta</surname>
<given-names>Abhijay</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kieseler</surname>
<given-names>Jan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1104101/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Loncar</surname>
<given-names>Vladimir</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pierini</surname>
<given-names>Maurizio</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/706738/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Qasim</surname>
<given-names>Shah Rukh</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1168271/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rieger</surname>
<given-names>Marcel</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Summers</surname>
<given-names>Sioni</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Van Onsem</surname>
<given-names>Gerrit</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wozniak</surname>
<given-names>Kinga Anna</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ngadiuba</surname>
<given-names>Jennifer</given-names>
</name>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Di Guglielmo</surname>
<given-names>Giuseppe</given-names>
</name>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Duarte</surname>
<given-names>Javier</given-names>
</name>
<xref ref-type="aff" rid="aff8">
<sup>8</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1041469/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Harris</surname>
<given-names>Philip</given-names>
</name>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rankin</surname>
<given-names>Dylan</given-names>
</name>
<xref ref-type="aff" rid="aff9">
<sup>9</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jindariani</surname>
<given-names>Sergo</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Mia</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Pedro</surname>
<given-names>Kevin</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/992199/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Tran</surname>
<given-names>Nhan</given-names>
</name>
<xref ref-type="aff" rid="aff10">
<sup>10</sup>
</xref>
<xref ref-type="aff" rid="aff11">
<sup>11</sup>
</xref>
<uri xlink:href="http://loop.frontiersin.org/people/1082126/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kreinar</surname>
<given-names>Edward</given-names>
</name>
<xref ref-type="aff" rid="aff12">
<sup>12</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Zhenbin</given-names>
</name>
<xref ref-type="aff" rid="aff13">
<sup>13</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>International Center for Elementary Particle Physics, University of Tokyo, <addr-line>Tokyo</addr-line>, <country>Japan</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Experimental Physics Department, European Organization for Nuclear Research (CERN), <addr-line>Geneva</addr-line>, <country>Switzerland</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Institute of Physics Belgrade, <addr-line>Belgrade</addr-line>, <country>Serbia</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>Manchester Metropolitan University, <addr-line>Manchester</addr-line>, <country>United&#x0020;Kingdom</country>
</aff>
<aff id="aff5">
<label>
<sup>5</sup>
</label>University of Vienna, <addr-line>Vienna</addr-line>, <country>Austria</country>
</aff>
<aff id="aff6">
<label>
<sup>6</sup>
</label>Department of Physics, Math and Astronomy, California Institute of Technology, <addr-line>Pasadena</addr-line>, <addr-line>CA</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff7">
<label>
<sup>7</sup>
</label>Department of Computer Science, Columbia University, <addr-line>New York</addr-line>, <addr-line>NY</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff8">
<label>
<sup>8</sup>
</label>Department of Physics, University of California, San Diego, <addr-line>San Diego</addr-line>, <addr-line>CA</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff9">
<label>
<sup>9</sup>
</label>Laboratory for Nuclear Science, Massachusetts Institute of Technology, <addr-line>Cambridge</addr-line>, <addr-line>MA</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff10">
<label>
<sup>10</sup>
</label>Department of Physics and Astronomy, Purdue University, <addr-line>West Lafayette</addr-line>, <addr-line>IN</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff11">
<label>
<sup>11</sup>
</label>Department of Electrical and Computer Engineering, Northwestern University, <addr-line>Evanston</addr-line>, <addr-line>IL</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff12">
<label>
<sup>12</sup>
</label>HawkEye360, <addr-line>Herndon</addr-line>, <addr-line>VA</addr-line>, <country>United&#x0020;States</country>
</aff>
<aff id="aff13">
<label>
<sup>13</sup>
</label>Department of Physics, University of Illinois at Chicago, <addr-line>Chicago</addr-line>, <addr-line>IL</addr-line>, <country>United&#x0020;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/676757/overview">Daniele D&#x2019;Agostino</ext-link>, National Research Council (CNR), Italy</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/695220/overview">Anushree Ghosh</ext-link>, University of Padua, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/680438/overview">Alexander Radovic</ext-link>, Borealis AI, Canada</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Yutaro Iiyama, <email>yutaro.iiyama@cern.ch</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Big Data and AI in High Energy Physics, a section of the journal Frontiers in Big&#x0020;Data</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>01</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2020</year>
</pub-date>
<volume>3</volume>
<elocation-id>598927</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>08</month>
<year>2020</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>10</month>
<year>2020</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2021 Iiyama, Cerminara, Gupta, Kieseler, Loncar, Pierini, Qasim, Rieger, Summers, Van Onsem, Wozniak, Ngadiuba, Di Guglielmo, Duarte, Harris, Rankin, Jindariani, Liu, Pedro, Tran, Kreinar and Wu.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Iiyama, Cerminara, Gupta, Kieseler, Loncar, Pierini, Qasim, Rieger, Summers, Van Onsem, Wozniak, Ngadiuba, Di Guglielmo, Duarte, Harris, Rankin, Jindariani, Liu, Pedro, Tran, Kreinar and Wu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x0020;terms.</p>
</license>
</permissions>
<abstract>
<p>Graph neural networks have been shown to achieve excellent performance for several crucial tasks in particle physics, such as charged particle tracking, jet tagging, and clustering. An important domain for the application of these networks is the FPGA-based first layer of real-time data filtering at the CERN Large Hadron Collider, which has strict latency and resource constraints. We discuss how to design distance-weighted graph networks that can be executed with a latency of less than one &#x03bc;s on an FPGA. To do so, we consider a representative task associated with particle reconstruction and identification in a next-generation calorimeter operating at a particle collider. We use a graph network architecture developed for such purposes, and apply additional simplifications to match the computing constraints of Level-1 trigger systems, including weight quantization. Using the hls4ml library, we convert the compressed models into firmware to be implemented on an FPGA. Performance of the synthesized models is presented both in terms of inference accuracy and resource&#x0020;usage.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>field-programmable gate arrays</kwd>
<kwd>fast inference</kwd>
<kwd>graph network</kwd>
<kwd>imaging calorimeter</kwd>
</kwd-group>
<counts>
<page-count count="0"/>
</counts>
</article-meta>
</front>
<body>
<sec id="s1">
<label>1.</label>
<title> Introduction</title>
<p>At the CERN Large Hadron Collider (LHC), high-energy physics (HEP) experiments collect signals generated by the particles produced in high-energy proton collisions that occur every 25 ns, when two proton beams cross. The readout from the detectors that capture the particles emerging from the collision is filtered by a real-time processing system, known as the <italic>trigger</italic>, that discards uninteresting collision events, based on a set of predefined algorithms. The trigger system is structured in two stages: a Level-1 trigger (L1T), implemented with custom electronics on-detector and field-programmable gate arrays (FPGAs); and a high-level trigger (HLT), consisting of a computer farm, possibly including co-processor accelerators like graphics processing units (GPUs) and FPGAs. Because of asynchronous event processing at the HLT, the accept/reject decision has to be reached with a typical latency of <inline-formula id="inf2">
<mml:math id="m1">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ms</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>. However, at the L1T, a decision must be taken within a fixed latency of <inline-formula id="inf3">
<mml:math id="m2">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>. The main limitations are the synchronous, &#x201c;hard-deadline&#x201d; nature of the processing system and the limited size of the memory buffer for the data from each beam crossing.</p>
<p>While HLT algorithms have a complexity comparable to those used <italic>offline</italic> to produce the final physics results, a typical L1T algorithm consists of simpler rules based on coarser objects to satisfy the latency constraint. Consequently, the resolution of quantities computed at the L1T is typically poor compared to offline quantities. Recently, the successful deployment of the first machine learning (ML) L1T algorithm, based on a boosted decision tree (BDT), at the LHC (<xref ref-type="bibr" rid="B3">Acosta et&#x0020;al., 2018</xref>) has changed this tendency, raising interest in using ML inference as fast-to-execute approximations of complex algorithms with good accuracy. This first example consisted of a large, pre-computed table of input and output values implementing a BDT, which raises the question of how to deploy more complex architectures. This question motivated the creation of hls4ml (<xref ref-type="bibr" rid="B22">Duarte et&#x0020;al., 2018</xref>; <xref ref-type="bibr" rid="B38">Loncar et&#x0020;al., 2020</xref>), a library designed to facilitate the deployment of ML algorithms on FPGAs.</p>
<p>A typical hls4ml workflow begins with a neural network model that is implemented and trained using <sc>Keras</sc> (<xref ref-type="bibr" rid="B33">Keras, 2015</xref>), <sc>PyTorch</sc> (<xref ref-type="bibr" rid="B45">Paszke et&#x0020;al., 2019</xref>), or <sc>TensorFlow</sc> (<xref ref-type="bibr" rid="B6">Abadi et&#x0020;al., 2015</xref>). The trained model is passed to hls4ml, directly or through the <sc>ONNX</sc> (<xref ref-type="bibr" rid="B11">Bai et&#x0020;al., 2019</xref>) interface, and converted to C&#x002b;&#x002b; code that can be processed by a high-level synthesis (HLS) compiler to produce an FPGA firmware. By design, hls4ml targets low-latency applications. To this end, its design prioritizes all-on-chip implementations of the most common network components. Its functionality has been demonstrated with dense neural networks (DNNs) (<xref ref-type="bibr" rid="B22">Duarte et&#x0020;al., 2018</xref>), extended to also support BDTs (<xref ref-type="bibr" rid="B51">Summers et&#x0020;al., 2020</xref>). Extensions to convolutional and recurrent neural networks are in development. The library comes with handles to compress the model by quantization, up to binary and ternary precision (<xref ref-type="bibr" rid="B21">Di Guglielmo et&#x0020;al., 2020</xref>). Recently, support for <sc>QKeras</sc> (<xref ref-type="bibr" rid="B48">Qkeras, 2020</xref>) models has been added, in order to allow for quantization-aware training of models (<xref ref-type="bibr" rid="B19">Coelho et&#x0020;al., 2020</xref>). While the hls4ml applications go beyond HEP, its development has been driven by the LHC L1T use&#x0020;case.</p>
<p>Graph neural networks (GNNs) are among the complex architectures whose L1T implementations are in high demand, given the growing list of examples showing how well GNNs can deal with tasks related to HEP (<xref ref-type="bibr" rid="B27">Henrion et&#x0020;al., 2017</xref>; <xref ref-type="bibr" rid="B15">Choma et&#x0020;al., 2018</xref>; <xref ref-type="bibr" rid="B13">Abdughani et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B8">Arjona Mart&#x00ed;nez et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B30">Jin et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B31">Ju et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., 2019b</xref>; <xref ref-type="bibr" rid="B13">Bernreuther et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B40">Moreno et&#x0020;al., 2020a</xref>; <xref ref-type="bibr" rid="B41">Moreno et&#x0020;al., 2020b</xref>; <xref ref-type="bibr" rid="B49">Qu and Gouskos, 2020</xref>; <xref ref-type="bibr" rid="B50">Shlomi et&#x0020;al., 2020</xref>). In fact, while the irregular geometry of a typical HEP detector complicates the use of computing vision techniques such as convolutional neural networks, GNNs can naturally deal with the sparse and irregular nature of HEP&#x0020;data.</p>
<p>In this work, we show how a graph model can be efficiently deployed on FPGAs to perform inference within <inline-formula id="inf4">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> for HEP-related problems. We consider the distance-weighted architecture <sc>GarNet</sc>, introduced in <xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., (2019b)</xref>, which is designed to keep resource consumption under control by reducing as much as possible the number of operations. It has been demonstrated to perform well for a HEP-related task, namely particle reconstruction in a calorimeter. For these reasons, it represents a good candidate for our purpose. The firmware implementation of <sc>GarNet</sc> presented in this work has been included in hls4ml, representing the first graph-based algorithm available in the library.</p>
<p>We present a case study of a neural network algorithm based on <sc>GarNet</sc>, applied to a task of identifying the nature of an incoming particle and simultaneously estimating its energy from the energy deposition patterns in a simulated imaging calorimeter. The inference accuracy of the firmware implementation of the algorithm is compared against its offline counterpart running on processors (CPUs and GPUs). Latency and resource utilization of the translated FPGA firmware are reported, along with a discussion on their implications for real-world usage of similar algorithms.</p>
<p>This paper is structured as follows. In <xref ref-type="sec" rid="s2">Section 2</xref>, we briefly recount related work. <xref ref-type="sec" rid="s3">Section 3</xref> defines the main problem by outlining the challenges in designing a graph network compatible with L1T latency and resource constraints. <xref ref-type="sec" rid="s4">Section 4</xref> describes how <sc>GarNet</sc> addresses these challenges, and introduces a simplified form of the algorithm with a better affinity to a firmware implementation. The case study using a calorimeter simulation is presented in <xref ref-type="sec" rid="s5">Section 5</xref>, with detailed descriptions of the task setup, model architecture, training results, and the summary of FPGA firmware synthesis. Finally, conclusions are given in <xref ref-type="sec" rid="s6">Section&#x0020;6</xref>.</p>
</sec>
<sec id="s2">
<label>2.</label>
<title> Related Work</title>
<p>Graph neural networks are gaining interest in HEP applications, mainly due to their intrinsic advantage in dealing with sparse input datasets, which are very common in HEP. A recent review of applications of GNNs to HEP problems may be found in <xref ref-type="bibr" rid="B50">Shlomi et&#x0020;al., (2020)</xref>. In particular, dynamic GNNs (<xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., 2019b</xref>; <xref ref-type="bibr" rid="B55">Wang et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B24">Gray et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B34">Kieseler, 2020</xref>) are relevant for particle reconstruction tasks, such as tracking (<xref ref-type="bibr" rid="B31">Ju et&#x0020;al., 2019</xref>) and calorimetry (<xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., 2019b</xref>).</p>
<p>Development of ML models deployable to FPGA-based L1T systems is helped by tools for automatic network-to-circuit conversion such as hls4ml. Using hls4ml, several solutions for HEP-specific tasks (e.g., jet tagging) have been provided (<xref ref-type="bibr" rid="B22">Duarte et&#x0020;al., 2018</xref>; <xref ref-type="bibr" rid="B19">Coelho et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B21">Di Guglielmo et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B51">Summers et&#x0020;al., 2020</xref>), exploiting models with simpler architectures than what is shown here. This tool has been applied extensively for tasks in the HL-LHC upgrade of the CMS L1T system, including an autoencoder for anomaly detection, and DNNs for muon energy regression and identification, tau lepton identification, and vector boson fusion event classification (<xref ref-type="bibr" rid="B18">CMS Collaboration, 2020</xref>). However, prior to this work, GNN models had not yet been supported by hls4ml. To the best of our knowledge, the present work is the first demonstration of GNN inference on FPGAs for a HEP application.</p>
<p>Outside of HEP, hardware and firmware acceleration of GNN inference, and graph processing in general, has been an active area of study in recent years, motivated by the intrinsic inefficiencies of CPUs and GPUs when dealing with graph data (<xref ref-type="bibr" rid="B14">Besta et&#x0020;al., 2019</xref>; <xref ref-type="bibr" rid="B25">Gui et&#x0020;al., 2019</xref>). <xref ref-type="bibr" rid="B42">Nurvitadhi et&#x0020;al., 2014</xref>; <xref ref-type="bibr" rid="B43">Ozdal et&#x0020;al., 2016</xref>; <xref ref-type="bibr" rid="B10">Auten et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B23">Geng et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B36">Kiningham et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B57">Yan et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B58">Zeng and Prasanna, 2020</xref> describe examples of GNN acceleration architectures. <xref ref-type="bibr" rid="B10">Auten et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B23">Geng et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B57">Yan et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B58">Zeng and Prasanna, 2020</xref> are specific to the graph convolutional network (GCN) (<xref ref-type="bibr" rid="B37">Kipf and Welling, 2017</xref>), while the graph inference processor (GRIP) architecture in <xref ref-type="bibr" rid="B36">Kiningham et&#x0020;al., (2020)</xref> is efficient across a wide range of GNN models. All five architectures are designed for processing graphs with millions of vertices under a latency constraint (10&#x2013;1,000<inline-formula id="inf5">
<mml:math id="m4">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> or more) that is less stringent than in the HEP L1T environment (less than 1<inline-formula id="inf6">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>), and are thus not directly applicable to our use case. <xref ref-type="bibr" rid="B42">Nurvitadhi et&#x0020;al., (2014)</xref> and <xref ref-type="bibr" rid="B43">Ozdal et&#x0020;al., (2016)</xref> present frameworks that automatically generate register-transfer level (RTL) implementations for graph computations according to user-defined configurations. While these frameworks are applicable to various graph processing tasks, they require the user to specify the design in highly specific nonstandard format, rather than a standard serialized ML model as in our implementation.</p>
</sec>
<sec id="s3">
<label>3.</label>
<title> General Requirements and Challenges</title>
<p>In the framework of <xref ref-type="bibr" rid="B12">Battaglia et&#x0020;al., (2018)</xref>, a graph is a triplet <inline-formula id="inf7">
<mml:math id="m6">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>&#x03C5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="script">u</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf8">
<mml:math id="m7">
<mml:mi mathvariant="normal">&#x03C5;</mml:mi>
</mml:math>
</inline-formula> is a set of entities (vertices) each possessing some attributes in a fixed format, <inline-formula id="inf9">
<mml:math id="m8">
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
</mml:math>
</inline-formula> is a set of pairwise relations (edges) between the elements in <inline-formula id="inf10">
<mml:math id="m9">
<mml:mi mathvariant="normal">&#x03C5;</mml:mi>
</mml:math>
</inline-formula>, potentially possessing some additional attributes, and <inline-formula id="inf11">
<mml:math id="m10">
<mml:mi mathvariant="script">u</mml:mi>
</mml:math>
</inline-formula> are global (graph-level) attributes. While a GNN can be any neural network that acts on such graphs, in this work we specifically consider graph networks (GN) (<xref ref-type="bibr" rid="B12">Battaglia et&#x0020;al., 2018</xref>), i.e.,&#x0020;architectures that consist of repeatable graph-to-graph mapping blocks (GN blocks). Each GN block performs some combination of operations such as edge feature transformation, aggregation of neighbors&#x2019; features at each vertex, vertex feature transformation, global aggregation of edge and vertex features, and global feature transformation. A GN takes a graph as an input sample, where the cardinality of <inline-formula id="inf12">
<mml:math id="m11">
<mml:mi>&#x03C5;</mml:mi>
</mml:math>
</inline-formula> may differ sample to sample, and infers its properties, which may be anything from a global scalar, such as a classification label of the sample, to new edge attributes.</p>
<p>To be usable as a part of an LHC L1T system, an algorithm must execute within <inline-formula id="inf13">
<mml:math id="m12">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and have the throughput to accept all inputs from each beam crossing every 25<inline-formula id="inf14">
<mml:math id="m13">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ns</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>. Time-multiplexing, whereby <italic>N</italic> copies of the algorithm accept inputs from <italic>N</italic> different beam crossings, may be used to decrease the throughput requirement by a factor of <italic>N</italic>. Additionally, there is a practical constraint that the firmware implementation should fit in the FPGA resources of the system, i.e.,&#x0020;utilize the resources such as digital signal processing units (DSPs), look-up tables (LUTs), flip-flops (FFs), and block RAM (BRAM) within the limits of chips available on the market. Satisfying these requirements with a GNN can be challenging for multiple reasons listed below.<list list-type="bullet">
<list-item>
<p>Model depth: Within each GN block, vertices exchange information with other directly connected vertices or with global attributes. Therefore, to expand the receptive field of each vertex beyond the nearest neighbors, multiple GN blocks must be repeated in the network. Given that various transformations within each GN block are often themselves multilayer perceptrons (MLPs), GNN models tend to be quite deep. Deep networks go against the latency requirement, as each perceptron layer uses at least one clock cycle on an FPGA under a straightforward implementation, and also against the resource usage requirement, because MLPs utilize multiplications heavily.</p>
</list-item>
<list-item>
<p>Input size: Typically, for problems where the application of GNNs is interesting, the cardinality of <inline-formula id="inf15">
<mml:math id="m14">
<mml:mi mathvariant="normal">&#x03C5;</mml:mi>
</mml:math>
</inline-formula> is at least <inline-formula id="inf16">
<mml:math id="m15">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Even with the high degree of parallelism of FPGAs, due to finiteness of the compute resource, such large input will have to be processed serially to a certain extent, increasing the latency and the interval before a new input can be accepted, known as the initiation interval (II). Longer IIs lead to lower throughput values.</p>
</list-item>
<list-item>
<p>Memory usage: Related to the problem of the input size, if the algorithm requires temporary retention of features for all vertices or edges, memory usage may be prohibitive for an FPGA firmware implementation.</p>
</list-item>
<list-item>
<p>Memory access pattern: Except for certain cases, algorithms that have both <inline-formula id="inf17">
<mml:math id="m16">
<mml:mi>&#x03C5;</mml:mi>
</mml:math>
</inline-formula> and <inline-formula id="inf18">
<mml:math id="m17">
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
</mml:math>
</inline-formula> in the input usually require random memory access, for example when reading or writing features of vertices at the ends of the edges. This poses a challenge in FPGA firmware design not only because it implies that there needs to be a large enough memory bank to store all vertex and/or edge data, but also because random memory access itself is a costly operation (<xref ref-type="bibr" rid="B14">Besta et&#x0020;al., 2019</xref>). The exceptions include when <inline-formula id="inf19">
<mml:math id="m18">
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
</mml:math>
</inline-formula> is trivial (<inline-formula id="inf20">
<mml:math id="m19">
<mml:mrow>
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mo>&#x2205;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> or when the graph is complete) and when all samples have an identical graph topology. In such cases, the memory access pattern of the algorithm is known at compile time and therefore can be statically scheduled in the FPGA firmware.</p>
</list-item>
</list>
</p>
<p>The case of <inline-formula id="inf21">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mo>&#x2205;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is a rather extreme solution to the last challenge, but it is also attractive in terms of memory usage. In fact, even without explicit input edge features, a GNN can infer regional and non-local properties of the graph by globally gathering the vertex features and then scattering the gathered information back to the vertices. This information flow can also be mediated by a learnable attention mechanism (<xref ref-type="bibr" rid="B54">Veli&#x010d;kovi&#x0107; et&#x0020;al., 2018</xref>). The attention mechanism suppresses information from vertices that are considered unimportant, effectively forming &#x201c;soft&#x201d; edges among the unsuppressed vertices.</p>
<p>In the next section, we study a GNN architecture with these exact properties, then discuss the modifications to the architecture to make it suitable for an FPGA firmware implementation.</p>
</sec>
<sec id="s4">
<label>4.</label>
<title>A Simplified GARNET Layer in the HLS4ML Framework</title>
<p>In this work, we consider <sc>GarNet</sc> (<xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., 2019b</xref>) as a specific example of GNN. A <sc>GarNet</sc> layer is a GN block that takes as input a set of <italic>V</italic> vertices, each possessing <inline-formula id="inf22">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> features, and returns the same set of vertices with <inline-formula id="inf23">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> features. In a <sc>GarNet</sc> layer, <inline-formula id="inf24">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> features of each vertex are encoded into an internal representation and gathered at <italic>S aggregators</italic>. A distance parameter between each of the aggregators and vertices is also computed from the vertex attributes. Information gathered at the aggregators are then sent back to individual vertices and decoded into <inline-formula id="inf25">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> features. Communications between the vertices and aggregators are weighted by a decreasing function of the distance parameter, implementing an attention mechanism that allows the network to learn a dynamic, nontrivial graph structure from the vertex input&#x0020;alone.</p>
<p>The original <sc>GarNet</sc> algorithm, while already using less compute and memory resources than other similar GNN architectures in <xref ref-type="bibr" rid="B47">Qasim et&#x0020;al. (2019b)</xref> and <xref ref-type="bibr" rid="B55">Wang et&#x0020;al. (2019)</xref>, is still challenging to implement as fast and high-throughput FPGA firmware. The biggest problem arises from the use of the input feature vector as a part of the input to the decoder, which requires retention of the input data until the last steps of the algorithm. An immediate consequence of this requirement is a longer II, because processing of new samples cannot start while the input data for the current sample is still in use. Furthermore, the input feature vector is already used to compute the distance parameter as well as the internal representation of each vertex, and therefore a reuse of the input in the decoder creates a complex data flow, restricting the options for pipelining the algorithm.</p>
<p>We therefore designed a modified <sc>GarNet</sc> algorithm with a simplified processing flow:<list list-type="bullet">
<list-item>
<p>Input transformation (<xref ref-type="fig" rid="F1">Figures 1A,B</xref>): An encoder network converts the features <inline-formula id="inf26">
<mml:math id="m25">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of the <inline-formula id="inf27">
<mml:math id="m26">
<mml:mrow>
<mml:msup>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mtext>th</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> vertex <inline-formula id="inf28">
<mml:math id="m27">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> into an internal <italic>learned representation</italic> vector <inline-formula id="inf29">
<mml:math id="m28">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. In parallel, another network (distance calculator) also acts on <inline-formula id="inf30">
<mml:math id="m29">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and computes the distance parameters <inline-formula id="inf31">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> between the vertices and the <italic>S</italic> aggregators. Implicitly, this means that a complete bipartite graph with <inline-formula id="inf32">
<mml:math id="m31">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> edges is built from <inline-formula id="inf33">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="script">V</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf34">
<mml:math id="m33">
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf35">
<mml:math id="m34">
<mml:mrow>
<mml:mi mathvariant="script">S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the set of aggregators (<xref ref-type="fig" rid="F1">Figure&#x0020;1B</xref>). The encoder and distance calculator networks are both single-layer perceptrons with linear activation functions, so one can write them as linear transformations</p>
</list-item>
</list>
<disp-formula id="e1">
<mml:math id="m35">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:mrow>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mstyle>
<mml:mo>&#x002b;</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math> <label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mi>&#x03b1;</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msub>
<mml:mi>&#x03b2;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>&#x00a0;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(2)</label>
</disp-formula>where <inline-formula id="inf36">
<mml:math id="m37">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf37">
<mml:math id="m38">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>&#x03b1;</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x03b2;</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are the kernels and biases of the encoder and distance calculator networks, respectively.<list list-type="bullet">
<list-item>
<p>Aggregation (<xref ref-type="fig" rid="F1">Figure&#x0020;1C</xref>): The learned representation vectors <inline-formula id="inf38">
<mml:math id="m39">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> of the vertices are weighted by a potential function <inline-formula id="inf39">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and averaged across the vertices. In other words, the <italic>i</italic>th averaged feature <inline-formula id="inf40">
<mml:math id="m41">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> of aggregator <italic>a</italic> is written&#x0020;as</p>
</list-item>
</list>
<disp-formula id="e3">
<mml:math id="m42">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>The factor <inline-formula id="inf41">
<mml:math id="m43">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> in the denominator is the maximum possible value for the vertex multiplicity <italic>V</italic> (as <italic>V</italic> may have a different value for each input sample). Through this normalization by a common factor, the information about the size of the sample (cardinality of <inline-formula id="inf42">
<mml:math id="m44">
<mml:mi mathvariant="script">V</mml:mi>
</mml:math>
</inline-formula>) is effectively encoded into&#x0020;<inline-formula id="inf43">
<mml:math id="m45">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>.<list list-type="bullet">
<list-item>
<p>Output transformation (<xref ref-type="fig" rid="F1">Figures 1D,E</xref>): The aggregated features are sent back to the vertices using the same weights&#x0020;as</p>
</list-item>
</list>
<disp-formula id="e4">
<mml:math id="m46">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>f</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(4)</label>
</disp-formula>and then transformed by a single-layer decoder network with linear activation function into the final output representation <inline-formula id="inf44">
<mml:math id="m47">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. With the kernel <italic>u</italic> and bias <italic>c</italic> of the decoder, this is written as<disp-formula id="e5">
<mml:math id="m48">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>S</mml:mi>
</mml:munderover>
<mml:msubsup>
<mml:mi>u</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>f</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math> <label>(5)</label>
</disp-formula>This simplified algorithm differs from the original design in the following ways. First, only the mean over vertices is computed at the aggregators, whereas the maximum is also used in the original design. In other words, the aggregators in the original design have<disp-formula id="e6">
<mml:math id="m49">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
<mml:mi>v</mml:mi>
</mml:munder>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math> <label>(6)</label>
</disp-formula>as an additional set of features. Secondly, as already noted, the input feature vector is not used as a part of the input to the decoder network. In the original <sc>GarNet</sc> design, the decoder is expressed as<disp-formula id="e7">
<mml:math id="m50">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>S</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>u</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msubsup>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x002b;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(7)</label>
</disp-formula>with additional sets of kernel weights <inline-formula id="inf45">
<mml:math id="m51">
<mml:mrow>
<mml:msup>
<mml:mi>u</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf46">
<mml:math id="m52">
<mml:mrow>
<mml:msup>
<mml:mi>w</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Finally, the original design applies a nonlinear (<inline-formula id="inf47">
<mml:math id="m53">
<mml:mrow>
<mml:mtext>tanh</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>) activation function to the decoder, while the simplified version uses a linear activation. In the specific case considered in the next section, these simplifications result in negligible degradation of the network performance. In the remainder of this paper, this simplified version of the algorithm is referred to as <sc>GarNet</sc>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Processing flow of the modified <sc>GarNet</sc> algorithm: <bold>(A)</bold> The input features <inline-formula id="inf48">
<mml:math id="m54">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of each vertex are processed by a linear network, that returns a new set of features <inline-formula id="inf49">
<mml:math id="m55">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and its distance from the <italic>S</italic> aggregators <inline-formula id="inf50">
<mml:math id="m56">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. <bold>(B)</bold> A graph is built in the learned space, using the <inline-formula id="inf51">
<mml:math id="m57">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> distances. <bold>(C)</bold> A message is gathered by each aggregator, as a weighted sum across the vertices of <inline-formula id="inf52">
<mml:math id="m58">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, with <inline-formula id="inf53">
<mml:math id="m59">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mtext>exp</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msubsup>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> as weights. <bold>(D)</bold> A message from each aggregator (<inline-formula id="inf54">
<mml:math id="m60">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>f</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>) is passed back to each vertex, with the same <inline-formula id="inf55">
<mml:math id="m61">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> weight. <bold>(E)</bold> The aggregated outputs of each vertex are given as input to a neural network, which returns the learned representation.</p>
</caption>
<graphic xlink:href="fdata-03-598927-g001.tif"/>
</fig>
<p>It is worth pointing out that while the <sc>GarNet</sc> layer uses only linear activation functions for all of the internal neural networks, it can still learn nonlinear functions through the nonlinearity of the potential function <inline-formula id="inf56">
<mml:math id="m62">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. On the other hand, having no nonlinear activation functions allows a compact FPGA firmware implementation of the layer, consisting mostly of multiplications and additions. The only substantial computation comes with the exponential function, whose values can be pre-computed with sufficient granularity and stored.</p>
<p>An FPGA firmware implementation of the <sc>GarNet</sc> layer using Vivado (<xref ref-type="bibr" rid="B44">O&#x2019;Loughlin et&#x0020;al., 2014</xref>) HLS is integrated into the hls4ml library. The HLS source code is written in C&#x002b;&#x002b; and is provided as a template, from which an HLS function for a <sc>GarNet</sc> layer can be instantiated, specifying the configurable parameters such as <italic>S</italic>, <inline-formula id="inf57">
<mml:math id="m63">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf58">
<mml:math id="m64">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. In the following, we provide some noteworthy details of the implementation.</p>
<p>In the HLS source code of <sc>GarNet</sc>, all quantities appearing in the computation are expressed as either integers or fixed-point numbers with fractional precision of at least eight bits. In particular, the distance parameter <inline-formula id="inf59">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is represented with three integer bits, eight fractional bits, and one sign bit. During the layer computation, <inline-formula id="inf60">
<mml:math id="m66">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is reinterpreted as a 12-bit unsigned integer, which is used to retrieve the corresponding pre-computed value of <inline-formula id="inf61">
<mml:math id="m67">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> from a table with 4,096 entries.</p>
<p>The processing flow in <xref ref-type="disp-formula" rid="e1">Eqs 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e5">5</xref> is compactified in the hls4ml implementation by exploiting the linearity of the encoder, average aggregation, and the decoder. <xref ref-type="disp-formula" rid="e1">Equations 1</xref>, <xref ref-type="disp-formula" rid="e3">3</xref>, and <xref ref-type="disp-formula" rid="e5">5</xref> can be combined into<disp-formula id="e8">
<mml:math id="m68">
<mml:mrow>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>S</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>w</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>b</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x002b;</mml:mo>
<mml:msup>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(8)</label>
</disp-formula>where<disp-formula id="e9">
<mml:math id="m69">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>w</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>u</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2003;</mml:mtext>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>b</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munderover>
<mml:msubsup>
<mml:mi>u</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mi>b</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2003;</mml:mtext>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2009;</mml:mo>
<mml:msubsup>
<mml:mi>g</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mtext>and</mml:mtext>
<mml:mo>&#x2009;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math> <label>(9)</label>
</disp-formula>In particular, the kernel and bias tensors of the encoder and decoder are contracted into <inline-formula id="inf62">
<mml:math id="m70">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>w</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf63">
<mml:math id="m71">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>b</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> at logic synthesis time, resulting in fewer steps to arrive at the output from the&#x0020;input.</p>
<p>With this simplification, the input data from each sample are encoded into <inline-formula id="inf64">
<mml:math id="m72">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf65">
<mml:math id="m73">
<mml:mrow>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf66">
<mml:math id="m74">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, a new sample can be processed as soon as the three quantities from the previous sample are computed. In other words, the II of the overall <sc>GarNet</sc> layer depends on the number of clock cycles needed to compute the three quantities. Furthermore, <inline-formula id="inf67">
<mml:math id="m75">
<mml:mrow>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf68">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mi>a</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> can be derived trivially from <inline-formula id="inf69">
<mml:math id="m77">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, making the latency of the computation of the latter the critical determinant of the throughput of the algorithm.</p>
<p>The computation of <inline-formula id="inf70">
<mml:math id="m78">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is performed independently on each vertex, and is therefore parallelizable across the vertices. In a fully parallelized implementation, there would be <inline-formula id="inf71">
<mml:math id="m79">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> logic units (one unit per vertex) operated simultaneously. However, with <italic>V</italic> typically as large as <inline-formula id="inf72">
<mml:math id="m80">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> or greater, this configuration would consume too much of the FPGA resources and would not fit on a single chip. Therefore, the hls4ml implementation of <sc>GarNet</sc> allows a partial parallelization of the algorithm controlled by a parameter called the <italic>reuse factor</italic> (<inline-formula id="inf73">
<mml:math id="m81">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). For <inline-formula id="inf74">
<mml:math id="m82">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003e;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the logic unit to compute <inline-formula id="inf75">
<mml:math id="m83">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is cloned <inline-formula id="inf76">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> times, such that each unit is reused serially up to <inline-formula id="inf77">
<mml:math id="m85">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> times. This serial reuse is fully pipelined with the local II of one clock cycle. The latency <inline-formula id="inf78">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>W</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for computing <inline-formula id="inf79">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for all vertices is therefore given by<disp-formula id="e10">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi>W</mml:mi>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:msubsup>
<mml:mi>T</mml:mi>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>&#x002b;</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(10)</label>
</disp-formula>where <inline-formula id="inf80">
<mml:math id="m89">
<mml:mrow>
<mml:msubsup>
<mml:mi>T</mml:mi>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>&#x223c;</mml:mo>
<mml:mn>20</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is the number of clock cycles needed to compute <inline-formula id="inf81">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> for one vertex. The value of <inline-formula id="inf82">
<mml:math id="m91">
<mml:mrow>
<mml:msubsup>
<mml:mi>T</mml:mi>
<mml:mi>W</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> depends on the numerical precision of the fixed-point numbers in the computation.</p>
<p>Finally, the kernel and bias of the encoder and the kernel of the decoder can be quantized, such that each element takes only values <inline-formula id="inf83">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, 0, or 1 (ternary quantization) (<xref ref-type="bibr" rid="B60">Zhu et&#x0020;al., 2017</xref>). In the quantized version of the algorithm, contracted kernel and bias <inline-formula id="inf84">
<mml:math id="m93">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>w</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf85">
<mml:math id="m94">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>b</mml:mi>
<mml:mo>&#x02dc;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> have elements that are <inline-formula id="inf86">
<mml:math id="m95">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> integers. Multiplication of small integers with fixed-point numbers can be performed in FPGAs using LUTs rather than DSPs, which are usually the more scarce resource. Multiplications with LUTs also proceed faster than those with&#x0020;DSPs.</p>
</sec>
<sec id="s5">
<label>5.</label>
<title> Case Study: Particle Identification and Energy Regression in an Imaging Calorimeter</title>
<p>As a case study, the hls4ml implementation of <sc>GarNet</sc> is applied to a representative task for the LHC L1T, namely reconstructing electrons and pions in a simulated 3D imaging calorimeter. In the following, we first describe the dataset used for the study, then define the task and the architectures of the ML models, and present the inference performance of the models and the resource usage of the synthesized firmware.</p>
<sec id="s5-1">
<label>5.1.</label>
<title> Dataset</title>
<p>The calorimeter is a multi-layered full-absorption detector with a geometry similar to the one described in <xref ref-type="bibr" rid="B47">Qasim et&#x0020;al., (2019b)</xref>. The detector is made entirely of tungsten, which is considered as both an absorber and a sensitive material, and no noise or threshold effects in the readout electronics are simulated. While this homogeneous calorimeter design is not a faithful representation of a modern sampling calorimeter, this simplification allows us to evaluate the performance of the ML models decoupled from detector effects.</p>
<p>The calorimeter extends 36&#x00a0;cm in <italic>x</italic> and <italic>y</italic> and has a total depth in <italic>z</italic> of 2&#x00a0;m, corresponding to approximately 20 nuclear interaction lengths and 170 radiation lengths. The coordinate origin is placed at the center of the front face of the calorimeter. The calorimeter is segmented into 50 layers along <italic>z</italic>, with each layer divided into small square cells in the <italic>x</italic>-<italic>y</italic> plane, forming a three-dimensional imaging detector. Cells are oriented so their sides are parallel to the <italic>x</italic> and <italic>y</italic> axes. Tiling of the cells in each layer is uniform except for in one quadrant, where the cell sides are half as long as those in the other area. The aim of the tiling is to incorporate the irregularity of the geometry of a real-life particle physics calorimeter. The quadrant with smaller cells and the remainder of the layer are respectively called the high granularity (HG) and low granularity (LG) regions. The first 25 layers in <italic>z</italic> correspond to the electromagnetic calorimeter, with a layer thickness of 1&#x00a0;cm and cell dimensions of 2.25&#x00a0;cm <inline-formula id="inf87">
<mml:math id="m96">
<mml:mo>&#x00d7;</mml:mo>
</mml:math>
</inline-formula> 2.25&#x00a0;cm in the HG region (4.5&#x00a0;cm <inline-formula id="inf88">
<mml:math id="m97">
<mml:mo>&#x00d7;</mml:mo>
</mml:math>
</inline-formula> 4.5&#x00a0;cm in LG). The remaining 25 layers correspond to the hadron calorimeter, with a layer thickness of 7&#x00a0;cm and cell dimensions of 3&#x00a0;cm <inline-formula id="inf89">
<mml:math id="m98">
<mml:mo>&#x00d7;</mml:mo>
</mml:math>
</inline-formula> 3&#x00a0;cm in the HG region (6&#x00a0;cm <inline-formula id="inf90">
<mml:math id="m99">
<mml:mo>&#x00d7;</mml:mo>
</mml:math>
</inline-formula> 6&#x00a0;cm in LG). Schematics of the cell tiling in the electromagnetic and hadron parts are shown in <xref ref-type="fig" rid="F2">Figure&#x0020;2</xref>. The geometry and the detector response to particles are simulated using <sc>Geant4</sc> (<xref ref-type="bibr" rid="B5">Agostinelli et&#x0020;al., 2003</xref>).</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Schematics of the high-granularity and low-granularity regions of the <bold>(A)</bold> electromagnetic and <bold>(B)</bold> hadron layers.</p>
</caption>
<graphic xlink:href="fdata-03-598927-g002.tif"/>
</fig>
<p>Each event used in this study contains a high-energy <italic>primary</italic> particle and low-energy <italic>pileup</italic> particles, which represent backgrounds from simultaneous additional proton-proton interactions. The primary particle is either an electron (<inline-formula id="inf91">
<mml:math id="m100">
<mml:mrow>
<mml:msup>
<mml:mtext>e</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) or a charged pion (<inline-formula id="inf92">
<mml:math id="m101">
<mml:mrow>
<mml:msup>
<mml:mi>&#x03c0;</mml:mi>
<mml:mo>&#x00b1;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>), shot at the calorimeter with momentum aligned along the <italic>z</italic> axis, i.e.,&#x0020;perpendicular to the front face of the calorimeter. The <italic>x</italic> and <italic>y</italic> coordinates of the particle&#x2019;s origin are randomly sampled according to a uniform distribution in a 10&#x00a0;cm <inline-formula id="inf93">
<mml:math id="m102">
<mml:mo>&#x00d7;</mml:mo>
</mml:math>
</inline-formula> 10&#x00a0;cm region centered at <inline-formula id="inf94">
<mml:math id="m103">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mi>y</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Following this procedure, we aim to mimic a realistic situation in which the actual calorimeter extends to a much larger surface and the area covered by the geometry used in this study represents a portion of it. The value of the particle momentum is drawn randomly for each event from a uniform distribution between 10 and 100&#x00a0;GeV. The pileup particles consist of photons (<italic>&#x03b3;</italic>) and <inline-formula id="inf95">
<mml:math id="m104">
<mml:mrow>
<mml:msup>
<mml:mi>&#x03c0;</mml:mi>
<mml:mo>&#x00b1;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The number of pileup particles is randomly sampled from a Poisson distribution with a mean of 40, with the <inline-formula id="inf96">
<mml:math id="m105">
<mml:mrow>
<mml:msup>
<mml:mi>&#x03c0;</mml:mi>
<mml:mo>&#x00b1;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> multiplicity fixed to twice the <italic>&#x03b3;</italic> multiplicity. This setup approximates the flux of pileup particles expected at a pseudorapdity <inline-formula id="inf97">
<mml:math id="m106">
<mml:mrow>
<mml:mi>&#x03b7;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> in a <inline-formula id="inf98">
<mml:math id="m107">
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>&#x03b7;</mml:mi>
<mml:mo>&#x00d7;</mml:mo>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>&#x03d5;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0.4</mml:mn>
<mml:mo>&#x00d7;</mml:mo>
<mml:mn>0.4</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> patch of the forward region of an LHC detector during the High-Luminosity LHC (HL-LHC) phase (<xref ref-type="bibr" rid="B7">Apollinari et&#x0020;al., 2017</xref>). The momentum direction and the window of origin of the pileup particles are the same as the primary particle. The momentum value of the pileup particles is sampled from a Landau distribution with <inline-formula id="inf99">
<mml:math id="m108">
<mml:mrow>
<mml:mi>&#x03bc;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0.6</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> GeV and <inline-formula id="inf100">
<mml:math id="m109">
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0.5</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> GeV, in a range of 0&#x2013;20&#x00a0;GeV.</p>
<p>The output of the simulation for each event is the array of total energy deposition values by the particles at individual detector cells (hits). Energy depositions by the particles in the homogeneous calorimeter are recorded exactly, i.e.,&#x0020;the detector output does not require calibration and is not affected by stochastic&#x0020;noise.</p>
<p>In an L1T system, hits containing energy depositions from a potentially interesting particle would be identified through a low-latency clustering algorithm. The clustering algorithm used in this study mimics the one planned for the L1T system of the HGCAL detector in CMS (<xref ref-type="bibr" rid="B16">CMS Collaboration, 2017a</xref>). In this approach, the hit with the largest energy deposition in the event is elected to be the seed, and the cluster consists of all hits contained in a cylinder whose axis passes through the center of the seed cell and extends along the <italic>z</italic> direction. The radius of the cylinder is set at 6.4&#x00a0;cm so that the resulting cluster contains 95% of the energy of the primary particle for 50% of the pion events. Because electromagnetic showers have a narrower energy spread than hadronic showers in general, all of the electron events have at least 95% of the energy contained in the same cylinder. Typical events with momenta of the primary particles around 50&#x00a0;GeV and the total pileup energy close to the median of the distribution are shown in <xref ref-type="fig" rid="F3">Figures 3A and 3B</xref>. The hits in the figure are colored by the fraction of the hit energy due to the primary particle (primary fraction, <inline-formula id="inf101">
<mml:math id="m110">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>prim</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) to help the visualization.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Examples of electron <bold>(A)</bold>, <bold>(C)</bold> and pion <bold>(B)</bold>, <bold>(D)</bold> events. Values in parentheses in the graph titles are the respective energy depositions contained in the cluster around the seed hit. Points represent hits in the detector, with their coordinates at the center of the corresponding detector cells and the size of the markers proportional to the square root of the hit energy. Opaque points are within the cluster, while the translucent ones are not. In <bold>(A)</bold> and <bold>(B)</bold>, the point color scale from blue to red corresponds to the primary fraction (see <xref ref-type="sec" rid="s5-1">Section 5.1</xref> for definition). In <bold>(C)</bold> and <bold>(D)</bold>, the color scale from blue to green corresponds to <inline-formula id="inf102">
<mml:math id="m111">
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, which is an indication of the importance the neural network model places to individual hits for energy regression. See <xref ref-type="sec" rid="s5-3">Section 5.3</xref> for details.</p>
</caption>
<graphic xlink:href="fdata-03-598927-g003.tif"/>
</fig>
<p>The actual dataset used in this study thus contains one cluster per sample, given as an array of hits in the cluster, and one integer indicating the number of hits in the sample. Only the hits with energy greater than 120&#x00a0;MeV are considered. Each cluster contains at most 128 hits, sorted by hit energy in decreasing order. Note that sorting of the hit has no effect on the neural network, and is only relevant when truncating the list of hits to consider smaller clusters, as explored later. In fact, 0.2% of the events resulted in clusters with more than 128 hits, for which the lowest energy hits were discarded from the dataset. Each hit is represented by four numbers, corresponding to the hit coordinates, given in <italic>x</italic>, <italic>y</italic>, and <italic>z</italic>, and energy. The <italic>x</italic> and <italic>y</italic> coordinates are relative to the seed cell. The dataset consists of 500,000 samples, split evenly and randomly into <inline-formula id="inf103">
<mml:math id="m112">
<mml:mrow>
<mml:msup>
<mml:mtext>e</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf104">
<mml:math id="m113">
<mml:mrow>
<mml:msup>
<mml:mi>&#x03c0;</mml:mi>
<mml:mo>&#x00b1;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> events, stored as <sc>NumPy</sc> (<xref ref-type="bibr" rid="B53">van der Walt et&#x0020;al., 2011</xref>; <xref ref-type="bibr" rid="B26">Harris et&#x0020;al., 2020</xref>) arrays in <sc>HDF5</sc> format (<xref ref-type="bibr" rid="B52">The HDF Group, 2020</xref>). The dataset together with the ground truth information is available on the Zenodo platform (<xref ref-type="bibr" rid="B29">Iiyama and Kieseler, 2020</xref>).</p>
</sec>
<sec id="s5-2">
<label>5.2.</label>
<title> Task and Model Architecture</title>
<p>The task in this study is to identify the nature of the primary particle and to simultaneously predict its energy, given the hits in the cluster. The ability to reliably identify the particle type and estimate its energy at the cluster level in a local calorimeter trigger system greatly enhances the efficacy of high-level algorithms, such as particle-flow reconstruction (<xref ref-type="bibr" rid="B6">ALEPH Collaboration, 1995</xref>; <xref ref-type="bibr" rid="B9">ATLAS Collaboration, 2017</xref>; <xref ref-type="bibr" rid="B17">CMS Collaboration, 2017b</xref>), downstream in the L1T system. However, because of the distortion of the energy deposition pattern in the cluster due to pileup, particle identification based on collective properties of the hits, such as the depth of the energy center of mass, can achieve only modest accuracy. Furthermore, only half of the pion events have 95% of the energy deposition from the pion contained in the cluster, requiring substantial extrapolation in the energy prediction. This task is thus both practically relevant and sufficiently nontrivial as a test bench of a <sc>GarNet</sc>-based ML&#x0020;model.</p>
<p>The architecture of the model is as follows. First, the input data represented by a two-dimensional array of <inline-formula id="inf105">
<mml:math id="m114">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x00d7;</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>in</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> numbers per cluster are processed by a stack of three <sc>GarNet</sc> layers. The parameters <inline-formula id="inf106">
<mml:math id="m115">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>LR</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mtext>out</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for the first two layers are <inline-formula id="inf107">
<mml:math id="m116">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>4,8,8</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and for the last layer are <inline-formula id="inf108">
<mml:math id="m117">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>8,16,16</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The output of the third <sc>GarNet</sc> layer is averaged across the vertices for each of the 16 features. The resulting array of 16 numbers is then passed through two fully connected layers with 16 and 8 nodes and ReLU (<xref ref-type="bibr" rid="B4">Agarap, 2018</xref>) activation. Data flow is split into two branches in the final step. The first branch consists of a fully connected layer with a single node, whose output is activated by a sigmoid function and is interpreted as the classification prediction, i.e.,&#x0020;the predicted probability that the primary particle is an electron. The other branch also consists of a single-node fully connected layer, but with a linear activation of the output, which is interpreted as the predicted value of the energy of the particle.</p>
<p>This model is built in <sc>Keras</sc> (<xref ref-type="bibr" rid="B33">Keras, 2015</xref>), using the corresponding implementation of <sc>GarNet</sc> available in <xref ref-type="bibr" rid="B46">Qasim et&#x0020;al., (2019a)</xref>. In total, the model has 3,402 trainable parameters (2,976 in the three <sc>GarNet</sc> layers), whose values are optimized through a supervised training process using the Adam optimizer (<xref ref-type="bibr" rid="B35">Kingma and Ba, 2014</xref>). Input is processed in batches of 64 samples during training. The overall objective function that is minimized in the training is a weighted sum of objective functions for the classification and regression tasks:<disp-formula id="e11">
<mml:math id="m118">
<mml:mrow>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mi>&#x03b2;</mml:mi>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>class</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x002b;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03b2;</mml:mi>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math> <label>(11)</label>
</disp-formula>with <inline-formula id="inf109">
<mml:math id="m119">
<mml:mrow>
<mml:mi>&#x03b2;</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0.01</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. The objective function for classification <inline-formula id="inf110">
<mml:math id="m120">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>class</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the binary cross entropy in each batch between the truth labels (electrons are represented by 1 and pions by 0) and the classification output of the model. The objective function for regression <inline-formula id="inf111">
<mml:math id="m121">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the batch mean of the relative squared error<disp-formula id="e12">
<mml:math id="m122">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mo>[</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>]</mml:mo>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(12)</label>
</disp-formula>where <inline-formula id="inf112">
<mml:math id="m123">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf113">
<mml:math id="m124">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the predicted and true energies of the primary particle, respectively. The training is performed on 400,000 training and 100,000 validation samples over a few hundred epochs, with early stopping when the value of the objective function does not improve for ten consecutive epochs. Keeping the full training dataset on RAM and using two NVIDIA GeForce RTX 2080 Ti GPUs in parallel, each epoch takes roughly 30&#x00a0;s to process.</p>
<p>Additionally, we prepare a model in which the encoders and decoders of the <sc>GarNet</sc> layers are quantized as ternary networks using <sc>QKeras</sc> (<xref ref-type="bibr" rid="B19">Coelho et&#x0020;al., 2020</xref>; <xref ref-type="bibr" rid="B48">Qkeras, 2020</xref>), which performs quantization-aware training with the straight-through estimator by quantizing the layers during a forward pass but not a backward pass (<xref ref-type="bibr" rid="B20">Courbariaux et&#x0020;al., 2015</xref>; <xref ref-type="bibr" rid="B59">Zhou et&#x0020;al., 2016</xref>; <xref ref-type="bibr" rid="B39">Moons et&#x0020;al., 2017</xref>; <xref ref-type="bibr" rid="B19">Coelho et&#x0020;al., 2020</xref>). In the following, this model is referred to as the <italic>quantized model</italic>, and the original model as the <italic>continuous model</italic>. The quantized model is trained with the same objective function and training hyperparameters as the continuous&#x0020;model.</p>
<p>To evaluate the inference performance of the trained models, reference algorithms are defined separately for the classification and regression subtasks. The reference algorithm for classification (<italic>cut-based</italic> classification) computes the energy-weighted mean <inline-formula id="inf114">
<mml:math id="m125">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and standard deviation <inline-formula id="inf115">
<mml:math id="m126">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03c3;</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> of the <italic>z</italic> coordinates of the hits,<disp-formula id="e13">
<mml:math id="m127">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x003d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mtext>and</mml:mtext>
<mml:msub>
<mml:mi>&#x03c3;</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:msup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mtext>&#x200b;</mml:mtext>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:msubsup>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(13)</label>
</disp-formula>where <italic>i</italic> is the index of hits in the cluster and <inline-formula id="inf116">
<mml:math id="m128">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf117">
<mml:math id="m129">
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the <italic>z</italic> coordinate and energy of the <italic>i</italic>th hit. The cluster is labeled as an electron if <inline-formula id="inf118">
<mml:math id="m130">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x003c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>cut</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf119">
<mml:math id="m131">
<mml:mrow>
<mml:msub>
<mml:mi>&#x03c3;</mml:mi>
<mml:mi>z</mml:mi>
</mml:msub>
<mml:mo>&#x003c;</mml:mo>
<mml:msubsup>
<mml:mi>&#x03c3;</mml:mi>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mtext>cut</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, where <inline-formula id="inf120">
<mml:math id="m132">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mi>z</mml:mi>
<mml:mo>&#x00af;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mtext>cut</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf121">
<mml:math id="m133">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x03c3;</mml:mi>
<mml:mi>z</mml:mi>
<mml:mrow>
<mml:mtext>cut</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are predefined thresholds. Pions, and hadrons in general, tend to penetrate deeper in an absorbing detector and create showers of secondary particles with a larger transverse size than electrons and photons. For regression, the reference algorithm (<italic>weight-based</italic> regression) predicts the energy of the primary particle through a formula<disp-formula id="e14">
<mml:math id="m134">
<mml:mrow>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ref</mml:mtext>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x003d;</mml:mo>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>V</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x002b;</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math> <label>(14)</label>
</disp-formula>where <inline-formula id="inf122">
<mml:math id="m135">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the detector <italic>z</italic> layer of hit <italic>i</italic>. Parameters <inline-formula id="inf123">
<mml:math id="m136">
<mml:mrow>
<mml:mrow>
<mml:mo>{</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mo>}</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>50</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> are determined by minimizing <inline-formula id="inf124">
<mml:math id="m137">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x2112;</mml:mi>
<mml:mrow>
<mml:mtext>reg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> over the training dataset using <inline-formula id="inf125">
<mml:math id="m138">
<mml:mrow>
<mml:msubsup>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>ref</mml:mtext>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> as the predicted energy. Particle identification based on the energy deposition profile of the cluster and energy estimation based on weighted sum of hit energies are both common strategies in the conventional, non-ML-based event reconstruction approaches.</p>
</sec>
<sec id="s5-3">
<label>5.3.</label>
<title> Training Result</title>
<p>Performance of the trained continuous and quantized models, evaluated using the validation sample, are shown in <xref ref-type="fig" rid="F4">Figure&#x0020;4</xref>. For each ML model, the inference results based on the original <sc>Keras</sc> model and the HLS model, converted using hls4ml, are shown. The HLS model provides a realistic emulation of the synthesized FPGA firmware.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Classification <bold>(A)</bold> and regression <bold>(B)</bold> inference performance of the continuous and quantized <sc>GarNet</sc>-based models and the reference algorithms. Results from the <sc>Keras</sc> and HLS implementations are shown for the <sc>GarNet</sc>-based models. The classification performance is quantified with a ROC curve of electron identification efficiency vs. pion rejection efficiency. The inset in <bold>(A)</bold> shows a close-up view of the efficiency range 0.90&#x2013;0.96 for both axes. The regression performance is quantified as the response <inline-formula id="inf126">
<mml:math id="m139">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> in 10&#x00a0;GeV bins of <inline-formula id="inf127">
<mml:math id="m140">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The horizontal line in the box corresponds to the median of the distribution, the top and bottom of the box to the upper and lower quartiles, and the upper and lower ends of the whiskers to the 95th and 5th percentiles.</p>
</caption>
<graphic xlink:href="fdata-03-598927-g004.tif"/>
</fig>
<p>The classification performance is given in terms of receiver operating characteristic (ROC) curves that trace the electron identification efficiency (true positive fraction) and pion rejection efficiency (true negative fraction) for different thresholds of the classifiers. The two <sc>GarNet</sc>-based models perform similarly and better than the cut-based reference in terms of the electron identification efficiency for a given pion rejection efficiency. A detailed comparison of the four sets of results from the <sc>GarNet</sc>-based models in the inset reveals that the continuous model performs slightly better than the quantized model, and that the difference between the <sc>Keras</sc> and HLS implementations is smaller for the quantized&#x0020;model.</p>
<p>The regression performance is given in terms of the response <inline-formula id="inf128">
<mml:math id="m141">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Distributions of the response are summarized in 10&#x00a0;GeV bins of <inline-formula id="inf129">
<mml:math id="m142">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>true</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, separately for the continuous model, quantized model, and the weight-based reference. In each summary, the horizontal line in the box corresponds to the median of the distribution, the top and bottom of the box to the upper and lower quartiles, and the upper and lower ends of the whiskers to the 95th and 5th percentiles. The <sc>GarNet</sc>-based models exhibit narrower spreads of the response distributions in most of the bins, with the continuous model again performing slightly better than the quantized&#x0020;model.</p>
<p>The differences between the <sc>Keras</sc> and HLS implementations are due to the numerical precision in the computation. While the former represents all fractional numbers in 32-bit floating-point numbers, the latter employs fixed-point numbers with bit widths of at most 18. Consequently, for the quantized model, where the encoder and decoder of the <sc>GarNet</sc> layers employ integer weights for inference, the difference between the two implementations is smaller.</p>
<p>For both subtasks, the <sc>GarNet</sc>-based models generally outperform the reference algorithms. The reference algorithm has narrower spread of the response in some energy bins for the regression subtask. However, it is important to note that the weights and biases appearing in <xref ref-type="disp-formula" rid="e14">Eq. 14</xref> are optimized for a specific pileup profile, while in a real particle collider environment, pileup flux changes dynamically even on the timescale of a few hours. In contrast, algorithms based on inference of properties of individual hits, such as the <sc>GarNet</sc>-based models presented in this study, are expected to be able to identify hits due to pileup even under different pileup environments and thus to have a stable inference performance with respect to change in pileup flux. Since a detailed evaluation of application-specific performance of <sc>GarNet</sc> is not within the scope of this work, we leave this and other possible improvements to the model architecture and training to future studies.</p>
<p>To verify that <sc>GarNet</sc> can infer relations between individual vertices without edges <inline-formula id="inf130">
<mml:math id="m143">
<mml:mi mathvariant="normal">&#x2130;</mml:mi>
</mml:math>
</inline-formula> in the input, the following test is performed. Using the two events shown in <xref ref-type="fig" rid="F3">Figure&#x0020;3</xref>, the energy of each hit in the clusters is increased one at a time by 10%, and the inference with the continuous model is performed for each perturbed event. If the model has learned to perfectly distinguish the primary particle from pileup at the vertex level, a small change in the energy of a hit from pileup should result in no change in the predicted particle energy. In <xref ref-type="fig" rid="F3">Figures 3C and 3D</xref>, each hit in the cluster is colored by the ratio of the change of predicted particle energy and the amount of perturbation (<inline-formula id="inf131">
<mml:math id="m144">
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>). While some hits with <inline-formula id="inf132">
<mml:math id="m145">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>prim</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> appear with <inline-formula id="inf133">
<mml:math id="m146">
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>h</mml:mi>
<mml:mo>&#x003e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, a general correspondence between <inline-formula id="inf134">
<mml:math id="m147">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mtext>prim</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf135">
<mml:math id="m148">
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is observed. The occurrence of <inline-formula id="inf136">
<mml:math id="m149">
<mml:mrow>
<mml:mtext>&#x0394;</mml:mtext>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mtext>pred</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mtext>&#x0394;</mml:mtext>
<mml:mi>h</mml:mi>
<mml:mo>&#x003e;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is expected, given the extrapolation required to predict the full particle energy from the energy of the hits included in the cluster. With this test, we are able to probe how the <sc>GarNet</sc>-based model is learning the structure of the&#x0020;graph.</p>
</sec>
<sec id="s5-4">
<label>5.4.</label>
<title> Model Synthesis and Performance</title>
<p>The latency, II, and resource usage of the FPGA firmware synthesized from the HLS implementations are summarized in <xref ref-type="table" rid="T1">Table&#x0020;1</xref>. Vitis Core Development Kit 2019.2 (<xref ref-type="bibr" rid="B32">Kathail, 2020</xref>) is used for synthesis, with a Xilinx Kintex UltraScale FPGA (part number xcku115-flvb2104-2-i) as the target device and a clock frequency of 200&#x00a0;MHz. The reported resource usage numbers reflect the synthesis estimates from Vivado HLS. The latency and II reported here are the maximum values for samples with full <inline-formula id="inf137">
<mml:math id="m150">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> vertices; the actual HLS implementation allows early termination of the serial reuse of the vertex-processing logic unit for samples with fewer vertices. The area under the ROC curve (AUC) and overall response root mean square (RMS) are used to summarize the performance.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Summary of the latency, II, FPGA resource usage metrics, and inference accuracy metrics of the synthesized firmware. The reported resource usage numbers reflect the synthesis estimates from Vivado HLS. The target FPGA is a Xilinx Kintex UltraScale FPGA (part number xcku115-flvb2104-2-i), which has 5,520 DSPs, 663,360 LUTs, 1,326,720 FFs, and 77.8&#x00a0;Mb of BRAM (<xref ref-type="bibr" rid="B56">Xilinx, 2020</xref>). The utilized percentage of the targeted FPGA resources are denoted in the square brackets.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th>Model</th>
<th align="center">
<inline-formula id="inf138">
<mml:math id="m151">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">
<inline-formula id="inf139">
<mml:math id="m152">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</th>
<th align="center">Latency (Cycles)</th>
<th align="center">Interval (Cycles)</th>
<th align="center">DSP (<inline-formula id="inf140">
<mml:math id="m153">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>)</th>
<th align="center">LUT (<inline-formula id="inf141">
<mml:math id="m154">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>)</th>
<th align="center">FF (<inline-formula id="inf142">
<mml:math id="m155">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>)</th>
<th align="center">BRAM (Mb)</th>
<th align="center">ROC AUC</th>
<th align="center">Response RMS</th>
</tr>
</thead>
<tbody>
<tr>
<td>Continuous</td>
<td>128</td>
<td>32</td>
<td>155</td>
<td>55</td>
<td>3.1 [56%]</td>
<td>57 [9%]</td>
<td>39 [2.9%]</td>
<td>1.8 [2.3%]</td>
<td>0.98</td>
<td>0.23</td>
</tr>
<tr>
<td>Quantized</td>
<td>128</td>
<td>32</td>
<td>148</td>
<td>50</td>
<td>1.6 [29%]</td>
<td>70 [11%]</td>
<td>41 [3.1%]</td>
<td>1.9 [2.4%]</td>
<td>0.98</td>
<td>0.24</td>
</tr>
<tr>
<td>Quantized</td>
<td>64</td>
<td>16</td>
<td>99</td>
<td>34</td>
<td>1.6 [29%]</td>
<td>63 [9%]</td>
<td>38 [2.9%]</td>
<td>1.8 [2.3%]</td>
<td>0.96</td>
<td>0.24</td>
</tr>
<tr>
<td>Quantized</td>
<td>32</td>
<td>8</td>
<td>75</td>
<td>26</td>
<td>1.4 [25%]</td>
<td>52 [8%]</td>
<td>33 [2.5%]</td>
<td>1.8 [2.3%]</td>
<td>0.86</td>
<td>0.37</td>
</tr>
<tr>
<td>Quantized</td>
<td>16</td>
<td>4</td>
<td>63</td>
<td>22</td>
<td>1.5 [27%]</td>
<td>57 [9%]</td>
<td>37 [2.8%]</td>
<td>1.8 [2.3%]</td>
<td>0.64</td>
<td>0.36</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Comparing the continuous and quantized models with <inline-formula id="inf143">
<mml:math id="m156">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>128</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, the former has a longer latency and II and consumes substantially more DSPs. On the other hand, the quantized model uses more LUTs, mainly for the multiplications in the <sc>GarNet</sc> encoders and decoders, as discussed in <xref ref-type="sec" rid="s4">Section 4</xref>. However, it is known that the expected LUT usage tends to be overestimated in Vivado HLS, while the expected DSP usage tends to be accurate (<xref ref-type="bibr" rid="B22">Duarte et&#x0020;al., 2018</xref>; <xref ref-type="bibr" rid="B21">Di Guglielmo et&#x0020;al., 2020</xref>). The DSP usage of <inline-formula id="inf144">
<mml:math id="m157">
<mml:mrow>
<mml:mn>3.1</mml:mn>
<mml:mo>&#x00d7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> for the continuous model is well within the limit of the target device, but is more than what is available on a single die slice (<inline-formula id="inf145">
<mml:math id="m158">
<mml:mrow>
<mml:mn>2.8</mml:mn>
<mml:mo>&#x00d7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mn>3</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>) (<xref ref-type="bibr" rid="B56">Xilinx, 2020</xref>). The quantized model fits in one slice in all metrics. Given the small difference in the inference performance between the two models, it is clear that the quantized model is advantageous for this specific case&#x0020;study.</p>
<p>The latency of the synthesized quantized model at 148 clock periods, corresponding to 740<inline-formula id="inf146">
<mml:math id="m159">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ns</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>, satisfies the LHC L1T requirement of <inline-formula id="inf147">
<mml:math id="m160">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> execution. However, the II of 50 clock periods (250<inline-formula id="inf148">
<mml:math id="m161">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ns</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>) implies that the logic must be time-multiplexed tenfold to be able to process a single cluster per LHC beam crossing period of 25<inline-formula id="inf149">
<mml:math id="m162">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ns</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>. With <inline-formula id="inf150">
<mml:math id="m163">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> or more clusters expected per beam crossing in the collision environment of HL-LHC, the throughput of the synthesized firmware is therefore inadequate for a reasonably sized L1T calorimeter system with <inline-formula id="inf151">
<mml:math id="m164">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>100</mml:mn>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> FPGAs, and requires down-scoping or implementation improvements.</p>
<p>The simplest down-scoping measure is to reduce the size of the input. This is effective because the most prominent factor driving both the latency and the II of the firmware is <inline-formula id="inf152">
<mml:math id="m165">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (see <xref ref-type="disp-formula" rid="e10">Eq. 10</xref>), which in turn is determined by <inline-formula id="inf153">
<mml:math id="m166">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> to be able to fit the logic in a single chip. To test how short the II can be made while retaining a reasonable inference performance, additional models with <inline-formula id="inf154">
<mml:math id="m167">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, 32, and 16 are trained and synthesized into FPGA firmware. Clusters with more hits than <inline-formula id="inf155">
<mml:math id="m168">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are truncated by discarding the lowest energy hits. The fraction of truncated clusters for the three <inline-formula id="inf156">
<mml:math id="m169">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> values are 27%, 85%, and 99%, respectively.</p>
<p>The results of synthesis of the additional models are given in the last three rows of <xref ref-type="table" rid="T1">Table&#x0020;1</xref>. The values of FPGA resource usage metrics are similar in all quantized models because the ratio <inline-formula id="inf157">
<mml:math id="m170">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is kept at 4. The area under the ROC curve (AUC) and the root-mean-square (RMS) of the response are considered as metrics for the inference performance. Only a modest degradation of performance is observed by truncating the clusters to <inline-formula id="inf158">
<mml:math id="m171">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mtext>max</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x003d;</mml:mo>
<mml:mn>64</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, while the II is reduced by 16 clocks as a direct result of the reduction of <inline-formula id="inf159">
<mml:math id="m172">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> by the same amount. This working point might thus represent a reasonable compromise between the inference performance and throughput. Further cluster truncation results in considerable loss of inference accuracy. It is also clear that reduction of <inline-formula id="inf160">
<mml:math id="m173">
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mtext>reuse</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> has a diminishing return in terms of shorter II, and improvements to other parts of the algorithm are necessary to further reduce the&#x0020;II.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<label>6.</label>
<title> Conclusion</title>
<p>In this paper, we presented an implementation of a graph neural network algorithm as FPGA firmware with <inline-formula id="inf161">
<mml:math id="m174">
<mml:mrow>
<mml:mi mathvariant="script">O</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mi>&#x03bc;</mml:mi>
<mml:mtext>s</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> execution time. General considerations and challenges in implementing graph neural networks for real-time trigger systems at particle collider experiments are outlined, along with how algorithms such as <sc>GarNet</sc> address these issues. We then described the simplified version of <sc>GarNet</sc>, which is now available as a general-purpose graph network layer in the hls4ml library. An example use case of a machine learning model based on the simplified version of <sc>GarNet</sc>, applied to data from a simulation of a small imaging calorimeter, is presented. The model is able to learn to predict the identity and the energy of the particles detected at the calorimeter with high accuracy, while its firmware implementation executes in 740<inline-formula id="inf162">
<mml:math id="m175">
<mml:mrow>
<mml:mo>&#x2009;</mml:mo>
<mml:mtext>ns</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and fits easily in a commercially available FPGA. Although the throughput of the firmware is not sufficient to make the model readily deployable in a submicrosecond, real-time collider trigger system, its variants with reduced input size are shown to have higher throughput with reasonable inference performance. These results demonstrate that fast inference of graph neural networks in FPGAs is possible, and with hls4ml, various graph-based machine learning architectures can be automatically translated into firmware.</p>
</sec>
<sec id="s7">
<title>Data Availability Statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found below: <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3992780">https://doi.org/10.5281/zenodo.3992780</ext-link>, <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.3992780">doi:10.5281/zenodo.3992780</ext-link>. Simulation data set and the <sc>Keras</sc> source code used for the case study are available on the Zenodo platform (<xref ref-type="bibr" rid="B28">Iiyama, 2020</xref>).</p>
</sec>
<sec id="s8">
<title>Author Contributions</title>
<p>All authors listed have made a substantial, direct, and intellectual contribution to the work and approved it for publication.</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>MP, AG, KW, SS, VL, and JN are supported by the European Research Council (ERC) under the European Union&#x2019;s Horizon 2020 research and innovation program (Grant Agreement No. 772369). SJ, ML, KP, and NT are supported by Fermi Research Alliance, LLC under Contract No. DE-AC02-07CH11359 with the U.S. Department of Energy (DOE), Office of Science, Office of High Energy Physics. PH is supported by a Massachusetts Institute of Technology University grant. ZW is supported by the National Science Foundation under Grants Nos. 1606321 and 115164. JD is supported by DOE Office of Science, Office of High Energy Physics Early Career Research program under Award No. DE-SC0021187. CERN has provided the open access publication fee for this&#x0020;paper.</p>
</sec>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</body>
<back>
<ack>
<p>We acknowledge the Fast Machine Learning collective as an open community of multi-domain experts and collaborators. This community was important for the development of this project.</p>
</ack>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Abadi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Agarwal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Barham</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brevdo</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Citro</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>TensorFlow: large-scale machine learning on heterogeneous distributed systems</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://download.tensorflow.org/paper/whitepaper2015.pdf">http://download.tensorflow.org/paper/whitepaper2015.pdf</ext-link>
</comment>. </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abdughani</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.&#x0020;M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Probing stop pair production at the LHC with graph neural networks</article-title>. <source>J.&#x0020;High Energy Phys.</source> <volume>8</volume>, <fpage>55</fpage>. <pub-id pub-id-type="doi">10.1007/JHEP08(2019)055</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Acosta</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Brinkerhoff</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Busch</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Carnes</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Furic</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Gleyzer</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). &#x201c;<article-title>Boosted decision trees in the Level-1 muon endcap trigger at CMS</article-title>,&#x201d; in <conf-name>Proceedings, 18th international workshop on advanced computing and analysis techniques in physics research (ACAT 2017)</conf-name>, <conf-loc>Seattle, WA</conf-loc>, <conf-date>August 21&#x2013;25, 2017</conf-date> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>ACAT</publisher-name>), <fpage>042042</fpage>. <pub-id pub-id-type="doi">10.1088/1742-6596/1085/4/042042</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Agarap</surname>
<given-names>A. F.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep learning using rectified linear units (ReLU)</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1803.08375">https://arxiv.org/abs/1803.08375</ext-link>
</comment>. </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Agostinelli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Allison</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Amako</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Apostolakis</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Araujo</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Arce</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2003</year>). <article-title>Geant4&#x2014;a simulation toolkit</article-title>. <source>Nucl. Instrum. Methods Phys. Res.</source> <volume>506</volume>, <fpage>250</fpage>. <pub-id pub-id-type="doi">10.1016/S0168-9002(03)01368-8</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<collab>ALEPH Collaboration</collab> (<year>1995</year>). <article-title>Performance of the ALEPH detector at LEP</article-title>. <source>Nucl. Instrum. Methods Phys. Res.</source> <volume>360</volume>, <fpage>481</fpage>. <pub-id pub-id-type="doi">10.1016/0168-9002(95)00138-7</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Apollinari</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>B&#x00e9;jar Alonso</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Br&#x00fc;ning</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fessia</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lamont</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rossi</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>High-luminosity large hadron collider (HL-LHC): technical design report V. 0.1</article-title>, <comment>CERN Yellow Reports: Monographs</comment> (<publisher-loc>Geneva, Switzerland</publisher-loc>: <publisher-name>CERN</publisher-name>). <pub-id pub-id-type="doi">10.23731/CYRM-2017-004</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arjona Mart&#x00ed;nez</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cerri</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Pierini</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Spiropulu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vlimant</surname>
<given-names>J.&#x0020;R.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Pileup mitigation at the Large Hadron Collider with graph neural networks</article-title>. <source>Eur. Phys. J.&#x0020;Plus</source> <volume>134</volume>, <fpage>333</fpage>. <pub-id pub-id-type="doi">10.1140/epjp/i2019-12710-3</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<collab>ATLAS Collaboration</collab> (<year>2017</year>). <article-title>Jet reconstruction and performance using particle flow with the ATLAS detector</article-title>. <source>Eur. Phys. J.&#x0020;C</source> <volume>77</volume>, <fpage>466</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-017-5031-2</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Auten</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tomei</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Hardware acceleration of graph neural networks</article-title>,&#x201d; in <conf-name>57th ACM/IEEE Design Automation Conference (DAC)</conf-name>, <conf-loc>San Francisco, CA</conf-loc>, <conf-date>July 20&#x2013;24, 2020</conf-date> (<publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/DAC18072.2020.9218751</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>ONNX: open neural network exchange</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/onnx/onnx">https://github.com/onnx/onnx</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B12">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Battaglia</surname>
<given-names>P. W.</given-names>
</name>
<name>
<surname>Hamrick</surname>
<given-names>J.&#x0020;B.</given-names>
</name>
<name>
<surname>Bapst</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Sanchez-Gonzalez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zambaldi</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Malinowski</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Relational inductive biases, deep learning, and graph networks</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1806.01261">https://arxiv.org/abs/1806.01261</ext-link>
</comment>. </citation>
</ref>
<ref id="B13">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Bernreuther</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Finke</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kahlhoefer</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Kr&#x00e4;mer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>M&#x00fc;ck</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Casting a graph net to catch dark showers</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2006.08639">https://arxiv.org/abs/2006.08639</ext-link>
</comment>. </citation>
</ref>
<ref id="B14">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Besta</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Stanojevic</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>De Fine Licht</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ben-Nun</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hoefler</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Graph processing on FPGAs: taxonomy, survey, challenges</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1903.06697">https://arxiv.org/abs/1903.06697</ext-link>
</comment>. </citation>
</ref>
<ref id="B15">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Choma</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Monti</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gerhardt</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Palczewski</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ronaghi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Prabhat</surname>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Graph neural networks for IceCube signal classification</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1809.06166">https://arxiv.org/abs/1809.06166</ext-link>
</comment>. </citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<collab>CMS Collaboration</collab> (<year>2017a</year>). <article-title>The phase-2 upgrade of the CMS endcap calorimeter</article-title>. <comment>CMS Technical Design Report CERN-LHCC-2017-023. CMS-TDR-019</comment> (<publisher-loc>Geneva, Switzerland</publisher-loc>: <publisher-name>CERN</publisher-name>). </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<collab>CMS Collaboration</collab> (<year>2017b</year>). <article-title>Particle-flow reconstruction and global event description with the CMS detector</article-title>. <source>J.&#x0020;Instrum.</source> <volume>12</volume>, <fpage>P10003</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/12/10/P10003</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<collab>CMS Collaboration</collab> (<year>2020</year>). <article-title>The phase-2 upgrade of the CMS level-1 trigger</article-title>. <comment>CMS Technical Design Report CERN-LHCC-2020-004. CMS-TDR-021</comment> (<publisher-loc>Geneva, Switzerland</publisher-loc>: <publisher-name>CERN</publisher-name>). </citation>
</ref>
<ref id="B19">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Coelho</surname>
<given-names>C. N.</given-names>
</name>
<name>
<surname>Kuusela</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhuang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Aarrestad</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Loncar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Ngadiuba</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Automatic deep heterogeneous quantization of Deep Neural Networks for ultra low-area, low-latency inference on the edge at particle colliders</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2006.10159">https://arxiv.org/abs/2006.10159</ext-link>
</comment>. </citation>
</ref>
<ref id="B20">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Courbariaux</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>David</surname>
<given-names>J.&#x0020;P.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>BinaryConnect: training deep neural networks with binary weights during propagations</article-title>,&#x201d; in <source>Advances in neural information processing systems 28</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Cortes</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lawrence</surname>
<given-names>N. D.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>D. D.</given-names>
</name>
<name>
<surname>Sugiyama</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>3123</fpage>. </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Di Guglielmo</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Harris</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hoang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jindariani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kreinar</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Compressing deep neural networks on FPGAs to binary and ternary precision with hls4ml</article-title>. <source>Mach. Learn. Sci. Technol.</source> <volume>2</volume>, <fpage>015001</fpage>. <pub-id pub-id-type="doi">10.1088/2632-2153/aba042</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duarte</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Harris</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jindariani</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kreinar</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kreis</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Fast inference of deep neural networks in FPGAs for particle physics</article-title>. <source>J.&#x0020;Instrum.</source> <volume>13</volume>, <fpage>P07027</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/13/07/P07027</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Geng</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>AWB-GCN: a graph convolutional network accelerator with runtime workload rebalancing</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1908.10834">https://arxiv.org/abs/1908.10834</ext-link>
</comment>. </citation>
</ref>
<ref id="B24">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Gray</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Klijnsma</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ghosh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A dynamic reduction network for point clouds</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2003.08013">https://arxiv.org/abs/2003.08013</ext-link>
</comment>. </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gui</surname>
<given-names>C. Y.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X. Y.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>X. F.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>A survey on graph processing accelerators: challenges and opportunities</article-title>. <source>J.&#x0020;Comput. Sci. Technol.</source> <volume>34</volume>, <fpage>339</fpage>. <pub-id pub-id-type="doi">10.1007/s11390-019-1914-z</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Harris</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Millman</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>van der Walt</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Gommers</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Virtanen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cournapeau</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Array programming with NumPy</article-title>. <source>Nature</source> <volume>585</volume>, <fpage>357</fpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Henrion</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Cranmer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bruna</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cho</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Brehmer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Louppe</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Neural message passing for jet physics</article-title>,&#x201d; in <conf-name>Deep learning for physical sciences workshop at the 31st conference on neural information processing systems</conf-name>, <conf-loc>Long Beach, CA</conf-loc>, <conf-date>December 2017</conf-date> (<publisher-loc>Long Beach, CA</publisher-loc>: <publisher-name>NIPS</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. </citation>
</ref>
<ref id="B28">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Iiyama</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Keras model and weights for GARNET-on-FPGA</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/record/3992780">https://zenodo.org/record/3992780</ext-link>
</comment>. </citation>
</ref>
<ref id="B29">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Iiyama</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Kieseler</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Simulation of an imaging calorimeter to demonstrate GARNET on FPGA</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/record/3888910">https://zenodo.org/record/3888910</ext-link>
</comment>. </citation>
</ref>
<ref id="B30">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Jin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Sz.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>H. H.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Classifying the cosmic-ray proton and light groups on the LHAASO-KM2A experiment with the graph neural network</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1910.07160">https://arxiv.org/abs/1910.07160</ext-link>
</comment>. </citation>
</ref>
<ref id="B31">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Ju</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Farrell</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Calafiura</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Murnane</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Prabhat</surname>
</name>
<name>
<surname>Gray</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Graph neural networks for particle reconstruction in high energy physics detectors</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://ml4physicalsciences.github.io/files/NeurIPS_ML4PS_2019_83.pdf">https://ml4physicalsciences.github.io/files/NeurIPS_ML4PS_2019_83.pdf</ext-link>
</comment>. </citation>
</ref>
<ref id="B32">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kathail</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Xilinx Vitis unified software platform</article-title>,&#x201d; in <conf-name>2020 ACM/SIGDA international symposium on field-programmable gate arrays</conf-name>, <conf-loc>New York, NY</conf-loc>, <conf-date>March 2020</conf-date> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>173</fpage>. <pub-id pub-id-type="doi">10.1145/3373087.3375887</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="web">
<collab>Keras Special Interest Group</collab> (<year>2015</year>). <article-title>Keras</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://keras.io">https://keras.io</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B34">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kieseler</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Object condensation: one-stage grid-free multi-object reconstruction in physics detectors, graph and image data</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2002.03605">https://arxiv.org/abs/2002.03605</ext-link>
</comment>. </citation>
</ref>
<ref id="B35">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kingma</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Ba</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Adam: a method for stochastic optimization. 3rd international conference for learning representations</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1412.6980">https://arxiv.org/abs/1412.6980</ext-link>
</comment>. </citation>
</ref>
<ref id="B36">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kiningham</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Re</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Levis</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>GRIP: a graph neural network accelerator architecture</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2007.13828">https://arxiv.org/abs/2007.13828</ext-link>
</comment>. </citation>
</ref>
<ref id="B37">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Kipf</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Welling</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Semi-supervised classification with graph convolutional networks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=SJU4ayYgl">https://openreview.net/forum?id&#x003d;SJU4ayYgl</ext-link>
</comment>. </citation>
</ref>
<ref id="B38">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Loncar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tran</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Kreis</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ngadiuba</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Summers</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>hls-fpga-machine-learning/hls4ml: v0.3.0</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/hls-fpga-machine-learning/hls4ml">https://github.com/hls-fpga-machine-learning/hls4ml</ext-link>
</comment>. </citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Moons</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Goetschalckx</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Van Berckelaer</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Verhelst</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Minimum energy quantized neural networks</article-title>,&#x201d; in <conf-name>51st Asilomar conference on signals, systems, and computers</conf-name>, <conf-loc>Pacific Grove, CA</conf-loc>, <conf-date>October 29&#x2013;November 1, 2017</conf-date> (<publisher-loc>Pacific Grove, CA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1921</fpage>. </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moreno</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Cerri</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>J.&#x0020;M.</given-names>
</name>
<name>
<surname>Newman</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T. Q.</given-names>
</name>
<name>
<surname>Periwal</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020a</year>). <article-title>JEDI-net: a jet identification algorithm based on interaction networks</article-title>. <source>Eur. Phys. J.&#x0020;C</source> <volume>80</volume>, <fpage>58</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-020-7608-4</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moreno</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>T. Q.</given-names>
</name>
<name>
<surname>Vlimant</surname>
<given-names>J.&#x0020;R.</given-names>
</name>
<name>
<surname>Cerri</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Newman</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Periwal</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2020b</year>). <article-title>Interaction networks for the identification of boosted decays</article-title>. <source>Phys. Rev. D</source> <volume>102</volume>, <fpage>012010</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.102.012010</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Nurvitadhi</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Weisz</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hurkat</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hoe</surname>
<given-names>J.&#x0020;C.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). &#x201c;<article-title>GraphGen: an FPGA framework for vertex-centric graph computation</article-title>,&#x201d; in <conf-name>2014 IEEE 22nd annual international symposium on field-programmable custom computing machines</conf-name>, <conf-loc>Boston, MA</conf-loc>, <conf-date>May 11&#x2013;13, 2014</conf-date> (<publisher-loc>Boston, MA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>25</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1109/FCCM.2014.15</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ozdal</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Yesil</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ayupov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Greth</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Burns</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Energy efficient architecture for graph analytics accelerators</article-title>. <source>Comput. Archit. News</source> <volume>44</volume>, <fpage>166</fpage>. <pub-id pub-id-type="doi">10.1145/3007787.3001155</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Loughlin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Coffey</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Callaly</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lyons</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Morgan</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Xilinx Vivado high level synthesis: case studies</article-title>,&#x201d; in <conf-name>25th IET Irish signals and systems conference 2014 and 2014 China-Ireland international conference on information and communications technologies (ISSC 2014/CIICT 2014)</conf-name>, <conf-loc>Limerick, Ireland</conf-loc>, <conf-date>June 26&#x2013;27, 2014</conf-date> (<publisher-loc>Limerick, Ireland</publisher-loc>: <publisher-name>IET</publisher-name>), <fpage>352</fpage>&#x2013;<lpage>356</lpage>. <pub-id pub-id-type="doi">10.1049/cp.2014.0713</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Paszke</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lerer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bradbury</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chanan</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>PyTorch: an imperative style, high-performance deep learning library</article-title>,&#x201d; in <source>Advances in neural information processing systems</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Wallach</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Larochelle</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Beygelzimer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>d&#x2019;Alch&#x00e9; Buc</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fox</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Garnett</surname>
<given-names>R.</given-names>
</name>
</person-group> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>8026</fpage>. </citation>
</ref>
<ref id="B46">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Qasim</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Kieseler</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Iiyama</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pierini</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019a</year>). <article-title>caloGraphNN</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/jkiesele/caloGraphNN">https://github.com/jkiesele/caloGraphNN</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qasim</surname>
<given-names>S. R.</given-names>
</name>
<name>
<surname>Kieseler</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Iiyama</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Pierini</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019b</year>). <article-title>Learning representations of irregular particle-detector geometry with distance-weighted graph networks</article-title>. <source>Eur. Phys. J.&#x0020;C</source> <volume>79</volume>, <fpage>608</fpage>. <pub-id pub-id-type="doi">10.1140/epjc/s10052-019-7113-9</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="web">
<collab>Qkeras</collab> (<year>2020</year>). <article-title>Google</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/google/qkeras">https://github.com/google/qkeras</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gouskos</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>ParticleNet: jet tagging via particle clouds</article-title>. <source>Phys. Rev. D</source> <volume>101</volume>, <fpage>056019</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevD.101.056019</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Shlomi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Battaglia</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Vlimant</surname>
<given-names>J.&#x0020;R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Graph neural networks in particle physics</article-title>. <source>Machine Learn. Sci. Tech.</source> <pub-id pub-id-type="doi">10.1088/2632-2153/abbf9a</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Summers</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Di Guglielmo</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Harris</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hoang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jindariani</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Fast inference of boosted decision trees in FPGAs for particle physics</article-title>. <source>J.&#x0020;Instrum.</source> <volume>15</volume>, <fpage>05026</fpage>. <pub-id pub-id-type="doi">10.1088/1748-0221/15/05/P05026</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="web">
<collab>The HDF Group</collab> (<year>2020</year>). <article-title>Hierarchical data format, version 5 (1997&#x2013;2020)</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.hdfgroup.org/HDF5/">https://www.hdfgroup.org/HDF5/</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>van der Walt</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Colbert</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>The NumPy array: a structure for efficient numerical computation</article-title>. <source>Comput. Sci. Eng.</source> <volume>13</volume>, <fpage>22</fpage>. <pub-id pub-id-type="doi">10.1109/MCSE.2011.37</pub-id> </citation>
</ref>
<ref id="B54">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Veli&#x010d;kovi&#x0107;</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cucurull</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Casanova</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Romero</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Li&#x00f2;</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Graph attention networks</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/forum?id=rJXMpikCZ">https://openreview.net/forum?id&#x003d;rJXMpikCZ</ext-link>
</comment>. </citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sarma</surname>
<given-names>S. E.</given-names>
</name>
<name>
<surname>Bronstein</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Solomon</surname>
<given-names>J.&#x0020;M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Dynamic graph CNN for learning on point clouds</article-title>. <source>ACM Trans. Graph.</source> <volume>38</volume>. <pub-id pub-id-type="doi">10.1145/3326362</pub-id> </citation>
</ref>
<ref id="B56">
<citation citation-type="web">
<collab>Xilinx, Inc.</collab> (<year>2020</year>). <article-title>UltraScale FPGA product tables and product selection guide</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.xilinx.com/support/documentation/selection-guides/ultrascale-fpga-product-selection-guide.pdf">https://www.xilinx.com/support/documentation/selection-guides/ultrascale-fpga-product-selection-guide.pdf</ext-link>
</comment> (<comment>Accessed</comment> August 20, 2020). </citation>
</ref>
<ref id="B57">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>HyGCN: a GCN accelerator with hybrid architecture</article-title>,&#x201d; in <conf-name>2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)</conf-name>, <conf-loc>San Diego, CA</conf-loc>, <conf-date>February 2020</conf-date> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>15</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1109/HPCA47549.2020.00012</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Prasanna</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>GraphACT: accelerating GCN training on CPU-FPGA heterogeneous platforms</article-title>,&#x201d; in <conf-name>2020 ACM/SIGDA international symposium on field-programmable gate arrays</conf-name>, <conf-loc>New York, NY</conf-loc>, <conf-date>April 2020</conf-date> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>255</fpage>. <pub-id pub-id-type="doi">10.1145/3373087.3375312</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ni</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>DoReFa-Net: training low bitwidth convolutional neural networks with low bitwidth gradients</article-title>. <comment>[Preprint]. Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1606.06160">https://arxiv.org/abs/1606.06160</ext-link>
</comment>. </citation>
</ref>
<ref id="B60">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Mao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dally</surname>
<given-names>W. J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Trained ternary quantization</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf?id=S1_pAu9xl">https://openreview.net/pdf?id&#x003d;S1_pAu9xl</ext-link>
</comment>. </citation>
</ref>
</ref-list>
</back>
</article>
