<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2023.1275944</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Learnable axonal delay in spiking neural networks improves spoken word recognition</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Sun</surname> <given-names>Pengfei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2152978/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chua</surname> <given-names>Yansong</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/36075/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Devos</surname> <given-names>Paul</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/962117/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Botteldooren</surname> <given-names>Dick</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/561376/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Information Technology, WAVES Research Group, Ghent University</institution>, <addr-line>Ghent</addr-line>, <country>Belgium</country></aff>
<aff id="aff2"><sup>2</sup><institution>Neuromorphic Computing Laboratory, China Nanhu Academy of Electronics and Information Technology</institution>, <addr-line>Jiaxing</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Lei Deng, Tsinghua University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Yujie Wu, Tsinghua University, China; Alberto Pati&#x000F1;o-Saucedo, Spanish National Research Council (CSIC), Spain; Manolis Sifalakis, Imec, Netherlands; Qi Xu, Dalian University of Technology, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Yansong Chua <email>caiyansong&#x00040;cnaeit.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>17</volume>
<elocation-id>1275944</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>08</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>10</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2023 Sun, Chua, Devos and Botteldooren.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Sun, Chua, Devos and Botteldooren</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Spiking neural networks (SNNs), which are composed of biologically plausible spiking neurons, and combined with bio-physically realistic auditory periphery models, offer a means to explore and understand human auditory processing&#x02014;especially in tasks where precise timing is essential. However, because of the inherent temporal complexity in spike sequences, the performance of SNNs has remained less competitive compared to artificial neural networks (ANNs). To tackle this challenge, a fundamental research topic is the configuration of spike-timing and the exploration of more intricate architectures. In this work, we demonstrate that a learnable axonal delay combined with local skip-connections yields state-of-the-art performance on challenging benchmarks for spoken word recognition. Additionally, we introduce an auxiliary loss term to further enhance accuracy and stability. Experiments on the neuromorphic speech benchmark datasets, NTIDIDIGITS and SHD, show improvements in performance when incorporating our delay module in comparison to vanilla feedforward SNNs. Specifically, with the integration of our delay module, the performance on NTIDIDIGITS and SHD improves by 14% and 18%, respectively. When paired with local skip-connections and the auxiliary loss, our approach surpasses both recurrent and convolutional neural networks, yet uses 10 &#x000D7; fewer parameters for NTIDIDIGITS and 7 &#x000D7; fewer for SHD.</p></abstract>
<kwd-group>
<kwd>axonal delay</kwd>
<kwd>spiking neural network</kwd>
<kwd>speech processing</kwd>
<kwd>supervised learning</kwd>
<kwd>auditory modeling</kwd>
<kwd>neuromorphic computing</kwd>
</kwd-group>
<counts>
<fig-count count="7"/>
<table-count count="4"/>
<equation-count count="14"/>
<ref-count count="51"/>
<page-count count="12"/>
<word-count count="7149"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Neuromorphic Engineering</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Artificial neural networks (ANNs) have excelled in speech-processing tasks, relying on optimization algorithms, deep architectures, and powerful feature extraction methods like MFCC. Nevertheless, these typical feature extraction methods do not fully replicate the biologically realistic model of cochlear processing (Wu et al., <xref ref-type="bibr" rid="B31">2018a</xref>,<xref ref-type="bibr" rid="B33">b</xref>). Additionally, both ANNs and rate-based Spiking Neural Networks (SNNs) struggle with spiking inputs from biologically inspired cochlear models due to their sparse distribution and high temporal complexity (Wu et al., <xref ref-type="bibr" rid="B32">2021</xref>). The high energy consumption of ANNs further limits their deployment in mobile and wearable devices, hindering the development of sound classification systems (Davies et al., <xref ref-type="bibr" rid="B8">2018</xref>; Wu et al., <xref ref-type="bibr" rid="B33">2018b</xref>). Thus, there is a growing demand for bio-inspired SNN architectures capable of handling the outputs of bio-physically realistic cochlear models.</p>
<p>Despite considerable progress in translating insights from non-spiking ANNs to SNNs (Wu et al., <xref ref-type="bibr" rid="B32">2021</xref>; Xu et al., <xref ref-type="bibr" rid="B37">2023a</xref>,<xref ref-type="bibr" rid="B38">b</xref>) and the emergence of enhanced architectures (Xu et al., <xref ref-type="bibr" rid="B40">2018</xref>, <xref ref-type="bibr" rid="B41">2021</xref>, <xref ref-type="bibr" rid="B39">2022</xref>) along with sparse training methods (Shen et al., <xref ref-type="bibr" rid="B20">2023</xref>), these methods have primarily been applied to static datasets or non-stream datasets. While earlier research (Mostafa, <xref ref-type="bibr" rid="B16">2017</xref>; Hong et al., <xref ref-type="bibr" rid="B12">2019</xref>; Zhang et al., <xref ref-type="bibr" rid="B47">2021</xref>) has shown encouraging results on such datasets using temporal encoding algorithms, their potential for large-scale time-series datasets remains a question. Contrastingly, noteworthy advancements have been made by algorithms that directly handle event-driven audio tasks with a temporal dimension (Wu et al., <xref ref-type="bibr" rid="B34">2019</xref>, <xref ref-type="bibr" rid="B35">2020</xref>; Zhang et al., <xref ref-type="bibr" rid="B49">2019</xref>; Blouw and Eliasmith, <xref ref-type="bibr" rid="B4">2020</xref>; Y&#x00131;lmaz et al., <xref ref-type="bibr" rid="B43">2020</xref>). A notable method is the refinement of spike timing precision in models and the exploration of intricate architectures that meld both ANN insights and biological understanding. 
SNNs, which incorporate adjustable membrane and synaptic time constants (Fang et al., <xref ref-type="bibr" rid="B9">2021</xref>; Perez-Nieves et al., <xref ref-type="bibr" rid="B18">2021</xref>), as well as advanced and optimized firing thresholds (Yin et al., <xref ref-type="bibr" rid="B45">2021</xref>; Yu et al., <xref ref-type="bibr" rid="B46">2022</xref>), have shown substantial promise, especially in integrating precise spike timing to achieve top-tier classification accuracy. Although past methods have placed significant emphasis on the importance of spike-timing, believing that information is intricately embedded within the spatio-temporal structure of spike patterns (Wu et al., <xref ref-type="bibr" rid="B36">2018c</xref>), there has been a conspicuous gap in research concerning the specific effects of event transmission, notably axonal delay (Taherkhani et al., <xref ref-type="bibr" rid="B27">2015</xref>). Neurophysiological studies (Carr and Konishi, <xref ref-type="bibr" rid="B6">1988</xref>; Stoelzel et al., <xref ref-type="bibr" rid="B23">2017</xref>) highlight axonal delay&#x00027;s potential role in triggering varied neuronal responses. It is worth noting that axonal delay is a learnable parameter within the brain, extending beyond the realm of synaptic weights (Seidl, <xref ref-type="bibr" rid="B19">2014</xref>; Talidou et al., <xref ref-type="bibr" rid="B28">2022</xref>). Neuromorphic chips such as SpiNNaker (Furber et al., <xref ref-type="bibr" rid="B10">2014</xref>), IBM TrueNorth (Akopyan et al., <xref ref-type="bibr" rid="B1">2015</xref>), and Intel Loihi (Davies et al., <xref ref-type="bibr" rid="B8">2018</xref>) facilitate the programming of the delay module.</p>
<p>These developments have spurred the exploration of jointly training synaptic weights and axonal delay in deep SNNs. While earlier research mainly centered on fixed delays with trainable weights (Bohte et al., <xref ref-type="bibr" rid="B5">2002</xref>) and the concurrent training of synaptic weights and delays in shallow SNNs featuring a single layer (Taherkhani et al., <xref ref-type="bibr" rid="B27">2015</xref>; Wang et al., <xref ref-type="bibr" rid="B29">2019</xref>; Zhang et al., <xref ref-type="bibr" rid="B48">2020</xref>), there has recently been a degree of investigation into the joint training of the synaptic weights and axonal delays in deep SNNs (Shrestha and Orchard, <xref ref-type="bibr" rid="B21">2018</xref>; Shrestha et al., <xref ref-type="bibr" rid="B22">2022</xref>; Sun et al., <xref ref-type="bibr" rid="B26">2022</xref>, <xref ref-type="bibr" rid="B24">2023a</xref>; Hammouamri et al., <xref ref-type="bibr" rid="B11">2023</xref>; Pati&#x000F1;o-Saucedo et al., <xref ref-type="bibr" rid="B17">2023</xref>). Our prior effort (Sun et al., <xref ref-type="bibr" rid="B26">2022</xref>) stands as one of the initial successful attempts in applying this method to deep SNNs, achieving promising results in tasks characterized by high temporal complexity.</p>
<p>In this current work, we focus on spiking spoken word recognition tasks, namely NTIDIDIGITS (Anumula et al., <xref ref-type="bibr" rid="B2">2018</xref>) and SHD (Cramer et al., <xref ref-type="bibr" rid="B7">2020</xref>). These tasks are temporally complex (Iyer et al., <xref ref-type="bibr" rid="B13">2021</xref>) and are encoded as spikes through an audio-to-spiking conversion procedure inspired by neurophysiology. In pursuit of enhancing these tasks, we introduce a learnable axonal delay mechanism to govern the transmission process and achieve precise synchronization of spike timing. Alongside the axonal delay module, we delved into various intricate structures, showcasing their synergy with the delay module. Specifically, we propose a novel local skip-connection mechanism designed to mitigate information loss during the reset process, an endeavor that relies heavily on the precise availability of spike timing information. Additionally, we integrate an auxiliary loss to curb unwarranted neuron membrane potentials upon firing. Our results underscore the seamless integration of these intricate components with the delay modules, resulting in substantial performance enhancements. Our methods achieve state-of-the-art performance while requiring fewer parameters, as demonstrated by our experimental studies.</p>
<p>The rest of the paper is organized as follows. We provide a detailed description of the proposed methods in Section 2. In Section 3, we demonstrate the effectiveness of our algorithms on two event-based audio datasets and compare them with other SNNs and ANNs. We conclude and discuss future work in Section 4.</p></sec>
<sec sec-type="materials and methods" id="s2">
<title>2. Materials and methods</title>
<p>In this section, we begin by introducing the spiking neuron model utilized in this work. After that, we present the Variable Axonal Delay (VAD) and Local Skip-Connection methods. The introduction of the Variable Axonal Delay is loosely inspired by neurophysiology, as we argue that the variation of delays observed in the biological system could be advantageous for aligning temporal information on a millisecond time scale. As a result, transient sensory inputs can be condensed into specific spike bursts corresponding to their transience. Next, we introduce the concept of a local skip-connection architecture, which holds the potential to mitigate information loss during the reset mechanism, thereby enhancing the dynamic behavior of the neuron model. Finally, we demonstrate that the suppressed loss further enhances performance, improving the network&#x00027;s discriminative capabilities for target differentiation.</p>
<sec>
<title>2.1. Spiking neuron model</title>
<p>An SNN employs a spiking neuron as the basic computational unit with input and output in the form of spikes, maintaining an internal membrane potential over time. In this paper, we adopt the Spike Response Model (SRM) which phenomenologically describes the dynamic response of biological neurons.</p>
<p>Consider an input spike, <inline-formula><mml:math id="M1"><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Here <inline-formula><mml:math id="M2"><mml:msubsup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula> denotes a firing time of pre-synaptic neuron <italic>j</italic> in layer <italic>l</italic> &#x02212; 1 and &#x003B4; the spike function. In the SRM model, the incoming spike <inline-formula><mml:math id="M3"><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is converted into spike response signals by convolving with the spike response kernel &#x003F5;(<italic>t</italic>) and is then scaled by the synaptic weight to generate the Post Synaptic Potential (PSP). 
Likewise, the refractory period can be represented as <inline-formula><mml:math id="M4"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003BD;</mml:mi><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> which describes the characteristic recovery time needed before the neuron regains its capacity to fire again after having fired at time <italic>t</italic>. The neuron&#x00027;s membrane potential is the sum of all PSPs and refractory response
<disp-formula id="E1"><label>(1)</label><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003F5;</mml:mi><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003BD;</mml:mi><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
where <inline-formula><mml:math id="M6"><mml:msubsup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the membrane potential of neuron <italic>i</italic> and <inline-formula><mml:math id="M7"><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:math></inline-formula> is the synaptic weight from neuron <italic>j</italic> to neuron <italic>i</italic>.</p>
<p>A firing output is generated whenever <italic>u</italic><sub><italic>i</italic></sub>(<italic>t</italic>) crosses the predefined firing threshold &#x003B8;<sub><italic>u</italic></sub>. This generation process can be formulated by a Heaviside step function &#x00398; as follows
<disp-formula id="E2"><label>(2)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x00398;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p></sec>
<sec>
<title>2.2. Variable axonal delay (VAD) module</title>
<p>As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, a VAD is added to the output of each spiking neuron in layer <italic>l</italic>. Let <italic>N</italic> be the number of neurons at layer <italic>l</italic>, thus, the set of spike trains <italic>s</italic><sup><italic>l</italic></sup>(<italic>t</italic>) can be represented as follows
<disp-formula id="E3"><label>(3)</label><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
The forward pass of the delay module can be described as
<disp-formula id="E5"><label>(4)</label><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mover accent="true"><mml:mrow><mml:msup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>*</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Where <inline-formula><mml:math id="M13"><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is the set of learnable delays <inline-formula><mml:math id="M14"><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> in layer <italic>l</italic>, and <inline-formula><mml:math id="M15"><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is the spike trains output by the delay module. From the system point of view, limiting the axonal delay of each neuron to a reasonable range can speed up the training convergence. Thus, we clip the delay to the specified range during training and round down after each backpropagation.
<disp-formula id="E6"><label>(5)</label><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>u</mml:mi><mml:mi>n</mml:mi><mml:mi>d</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Here, the &#x003B8;<sub><italic>d</italic></sub> refers to the upper bound of the time delay of the spiking neuron.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Illustration of the flow chart of the axonal delay. The output spike is delayed by <inline-formula><mml:math id="M9"><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula>.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0001.tif"/>
</fig></sec>
<sec>
<title>2.3. Local skip-connection as compensation for loss of information due to reset</title>
<p>The structure of the local skip-connection within a given layer is depicted in <xref ref-type="fig" rid="F2">Figure 2</xref>. In mapping from input spikes to output spikes, the SRM utilizes a refractory kernel to characterize the refractory mechanism, represented by the equation <inline-formula><mml:math id="M17"><mml:mi>&#x003BD;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x00398;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. One challenge that persists is identifying the ideal refractory scale &#x003B1;<sub><italic>r</italic></sub> for specific tasks. If the refractory scale is too small, its effect is diminished, while an overly large refractory scale risks information loss at certain time junctures. To address this, our study introduces the concept of a local skip-connection. This design compensates for information lost during the reset mechanism in a dynamic fashion. 
Our results show that this connection can operate effectively using the same refractory scale, offering a solution to the intricate task of selecting an optimal refractory scale for various tasks. The output membrane potential of the local skip-connection can be formulated as
<disp-formula id="E7"><label>(6)</label><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x000FB;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:msubsup><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003F5;</mml:mi><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003BD;</mml:mi><mml:mo>*</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x0015D;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p>
<p><inline-formula><mml:math id="M19"><mml:msubsup><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the locally connected synaptic weight from neuron <italic>j</italic> to neuron <italic>i</italic> at the same layer. Unlike a skip connection, the local skip-connection adds an extra layer of processing to the output spikes generated in layer <italic>l</italic>. It then directs these locally processed output spikes, denoted as &#x0015D;<sup><italic>l</italic></sup> with the same index as the original output spikes <inline-formula><mml:math id="M20"><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, to follow the same axon line within layer <italic>l</italic>. As a result, both the local spike trains &#x0015D;<sup><italic>l</italic></sup> and the original output spikes <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> utilize the same weights <inline-formula><mml:math id="M22"><mml:msubsup><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and are channeled to the succeeding layer. 
This can be equivalently expressed as <inline-formula><mml:math id="M23"><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0015D;</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Flow chart illustrating the proposed methods. In the forward pass, the input spikes are mapped by the SRM, axonal delay module, and local skip-connection to the output spikes. The error consists of the spike rate loss from the last layer and the suppressed loss from the false neuron&#x00027;s membrane potential. The spiking layer consists of the spiking neuron model and membrane potential layer. The error gradients are passed backward through time to update the weight and axonal delay parameters.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0002.tif"/>
</fig></sec>
<sec>
<title>2.4. Loss layer</title>
<p>The loss of an SNN compares the output spikes with the ground truth. However, in classification tasks, decisions are typically made based on the spike count due to the absence of precise timing. Considering the spike rate over the time interval <italic>T</italic>, the loss function <italic>L</italic> can be formulated as follows:
<disp-formula id="E8"><label>(7)</label><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mrow><mml:mo>&#x0222B;</mml:mo></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mstyle><mml:mover accent="false"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:mi>&#x003C4;</mml:mi><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mrow><mml:mo>&#x0222B;</mml:mo></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup></mml:mstyle><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Here, <italic>L</italic> measures the disparity between the target spike train <inline-formula><mml:math id="M25"><mml:mover accent="false"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and output spike train <italic>s</italic><sup><italic>nl</italic></sup>(<italic>t</italic>) at the last layer <italic>n</italic><sub><italic>l</italic></sub> across the simulation time <italic>T</italic>. Given the lack of precise spike timing in our tasks, we measure the output spikes through the integration of <inline-formula><mml:math id="M26"><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> over <italic>T</italic>. For different task scenarios, the target firing rate is set as <inline-formula><mml:math id="M27"><mml:msubsup><mml:mrow><mml:mo>&#x0222B;</mml:mo></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>T</mml:mi></mml:mrow></mml:msubsup><mml:mover accent="false"><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:mi>&#x003C4;</mml:mi></mml:math></inline-formula>.</p>
<p>To further exploit temporal information in classification, an auxiliary loss termed the suppressed loss <italic>L</italic><sub><italic>Mem</italic></sub> is introduced:
<disp-formula id="E9"><label>(8)</label><mml:math id="M28"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mo>&#x00394;</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
This loss function is designed to reduce the firing probability of incorrect neurons right when they activate. Compared to previous lateral inhibition methods using learnable or fixed kernels, this loss function achieves a winner-takes-all effect by acting as a regularizer. Importantly, this loss is only applied to false neurons. Here, the spike train <inline-formula><mml:math id="M29"><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> and membrane potential <inline-formula><mml:math id="M30"><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> are functions of time. Moreover, <inline-formula><mml:math id="M31"><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mo>&#x00394;</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> refers to the membrane potential right before a spike occurs. 
When a neuron is activated, indicated by <inline-formula><mml:math id="M32"><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>, its potential is referred to as <inline-formula><mml:math id="M33"><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mo>&#x00394;</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. A predetermined membrane potential <italic>u</italic><sub>&#x003B8;</sub>, controlled by the suppressing factor &#x003BB;<sub><italic>u</italic></sub> and defined as <italic>u</italic><sub>&#x003B8;</sub> &#x0003D; &#x003BB;<sub><italic>u</italic></sub>&#x003B8;<sub><italic>u</italic></sub>, is then subtracted from this value. Lastly, to ensure that the suppressed membrane potential loss is limited only to undesired (or false) neurons, a mask <italic>Mask</italic> &#x02208; &#x0211D;<sup><italic>C</italic></sup> is employed, where <italic>C</italic> is the number of target neurons:
<disp-formula id="E10"><label>(9)</label><mml:math id="M34"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mtext class="textrm" mathvariant="normal">True Class</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>1</mml:mn></mml:mtd><mml:mtd><mml:mtext class="textrm" mathvariant="normal">False Classes</mml:mtext></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula></p></sec>
<sec>
<title>2.5. Backpropagation</title>
<p>The surrogate gradient algorithm in combination with Backpropagation-Through-Time (BPTT) (Werbos, <xref ref-type="bibr" rid="B30">1990</xref>) in SNNs has shown excellent performance on temporal pattern recognition tasks.</p>
<p>In this work, we discretise the temporal dimension with the sampling time <italic>T</italic><sub><italic>s</italic></sub> such that <italic>t</italic> &#x0003D; <italic>nT</italic><sub><italic>s</italic></sub> where <italic>n</italic> denotes the time step of the simulation. We also define (<italic>N</italic><sub><italic>s</italic></sub> &#x0002B; 1)<italic>T</italic><sub><italic>s</italic></sub> as the total observation time. For the Heaviside step function, we adapt the SLayer function (Shrestha and Orchard, <xref ref-type="bibr" rid="B21">2018</xref>) to formulate the proxy gradient, which is defined as
<disp-formula id="E11"><label>(10)</label><mml:math id="M35"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msubsup></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>-</mml:mo><mml:mo>|</mml:mo><mml:mi>u</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mi>&#x003D1;</mml:mi><mml:mo>|</mml:mo><mml:mo>/</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003D1;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Here, &#x003C4;<sub><italic>scale</italic></sub> and &#x003C4;<sub>&#x003D1;</sub> are two parameters that control the sharpness of the surrogate gradient. Similarly, the gradient of the axonal delay is given by
<disp-formula id="E12"><label>(11)</label><mml:math id="M36"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mi>E</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Using the chain rule and noting that the loss at time-step <italic>n</italic> depends on all previous timesteps, we get
<disp-formula id="E13"><label>(12)</label><mml:math id="M37"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mi>E</mml:mi></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" 
accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" 
accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
Here, the finite difference approximation <inline-formula><mml:math id="M38"><mml:mfrac><mml:mrow><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:math></inline-formula> is used to numerically estimate the gradient term <inline-formula><mml:math id="M39"><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msubsup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msubsup><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msup><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:math></inline-formula>. As part of the backpropagation process, the gradient of delay is propagated backward, and then the delay value is subsequently updated. Similarly, we also formulate the gradient term of the suppressed loss.
<disp-formula id="E14"><label>(13)</label><mml:math id="M40"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msub><mml:mrow><mml:mrow><mml:mi mathvariant="script">L</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mi>e</mml:mi><mml:mi>m</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:mfrac></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi><mml:mo>&#x000B7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>u</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
As shown in <xref ref-type="fig" rid="F2">Figure 2</xref>, beginning from the input layer, the spike trains compute forward and the error propagates backward.</p></sec></sec>
<sec id="s3">
<title>3. Experiments and results</title>
<p>In this section, we first evaluate the effectiveness of the proposed delay module and novel architecture on two event-based audio datasets: NTIDIDIGITS and SHD. Additionally, we assess the impact of the novel auxiliary loss in boosting performance. Finally, we compare our results with several state-of-the-art networks, including feedforward SNNs, recurrently connected SNNs (RSNNs), and Recurrent Neural Networks (RNNs).</p>
<sec>
<title>3.1. Implementation details</title>
<p>The experiments are conducted using PyTorch as a framework, and all reported results are obtained on 1 NVIDIA Titan XP GPU. Each network and proposed architecture is trained with the Adam optimizer (Kingma and Ba, <xref ref-type="bibr" rid="B14">2014</xref>) and has the same training cycle. The simulation time step <italic>T</italic><sub><italic>s</italic></sub> is 1 ms, and the firing threshold &#x003B8;<sub><italic>u</italic></sub> is set at 10 mV. The chosen response kernel is <inline-formula><mml:math id="M41"><mml:mi>&#x003F5;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x00398;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, and the refractory kernel is <inline-formula><mml:math id="M42"><mml:mi>&#x003BD;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B8;</mml:mi></mml:mrow><mml:mrow><mml:mi>u</mml:mi></mml:mrow></mml:msub><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003C4;</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x00398;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. The time constant of the response kernel &#x003C4;<sub><italic>s</italic></sub> and refractory kernel &#x003C4;<sub><italic>r</italic></sub> is set to 5 for NTIDIDIGITS and 1 for SHD datasets. The suppressed factor &#x003BB;<sub><italic>u</italic></sub> is set to 0.995 to suppress the membrane potential of the firing undesired neurons below the threshold. For the proxy gradient, we adopt the Slayer (Shrestha and Orchard, <xref ref-type="bibr" rid="B21">2018</xref>). <xref ref-type="table" rid="T1">Table 1</xref> lists other hyperparameters used.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Detailed hyper-parameter settings.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Hyper-parameter</bold></th>
<th valign="top" align="center"><bold>N-TDIDIGITS18</bold></th>
<th valign="top" align="center"><bold>SHD</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="1">Batch size</td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">128</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Learning rate</td>
<td valign="top" align="center">0.1</td>
<td valign="top" align="center">0.1</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Time constant &#x003C4;<sub><italic>s</italic></sub></td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">1</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Time constant &#x003C4;<sub><italic>r</italic></sub></td>
<td valign="top" align="center">5</td>
<td valign="top" align="center">1</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Membrane threshold &#x003B8;<sub><italic>u</italic></sub></td>
<td valign="top" align="center">10</td>
<td valign="top" align="center">10</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Refractory scale &#x003B1;<sub><italic>r</italic></sub></td>
<td valign="top" align="center">2</td>
<td valign="top" align="center">2</td>
</tr> <tr>
<td valign="top" align="left" rowspan="1">Delay threshold &#x003B8;<sub><italic>d</italic></sub></td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">64</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="1">Suppressed factor &#x003BB;<sub><italic>u</italic></sub></td>
<td valign="top" align="center">0.995</td>
<td valign="top" align="center">0.995</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>The following notation is used to describe the network architecture: &#x0201C;FC&#x0201D; stands for a fully-connected layer, &#x0201C;VAD&#x0201D; means Variable Axonal Delay module, &#x0201C;Local&#x0201D; denotes the local skip-connection architecture, and <italic>L</italic><sub><italic>Mem</italic></sub> implies the use of the suppressed loss in addition to the spike rate loss. For example, <monospace>Input-128FC-VAD-Local-128FC-VAD-Local-Output &#x0002B; L_{Mem}</monospace> indicates that there are two dense layers with 128 neurons, each implementing the VAD and Local module. The loss is measured by the spike rate and suppressed membrane potential. <xref ref-type="table" rid="T2">Table 2</xref> summarizes the abbreviations for different architectures and methods.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Name and corresponding network structure. L2 denotes the l2 regularizer for delay values.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="center"><bold>Name</bold></th>
<th valign="top" align="center"><bold>Network structure</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">D128-SNN</td>
<td valign="top" align="center">Input-128FC-VAD-128FC-VAD-Output</td>
</tr> <tr>
<td valign="top" align="left">DL128-SNN</td>
<td valign="top" align="center">Input-128FC-VAD-Local-128FC-<break/>VAD-Local-Output</td>
</tr> <tr>
<td valign="top" align="left">DL128-SNN-Dloss</td>
<td valign="top" align="center">Input-128FC-VAD-Local-128FC-VAD-<break/>Local-Output &#x0002B; <italic>L</italic><sub><italic>Mem</italic></sub></td>
</tr> <tr>
<td valign="top" align="left">DL256-SNN-Dloss</td>
<td valign="top" align="center">Input-128FC-VAD-Local-256FC-VAD-<break/>Local-Output &#x0002B; <italic>L</italic><sub><italic>Mem</italic></sub></td>
</tr> <tr>
<td valign="top" align="left">DL128-SNN-Dloss-L2</td>
<td valign="top" align="center">Input-128FC-L2(VAD)-Local-128FC-L2(VAD)-<break/>Local-Output &#x0002B; <italic>L</italic><sub><italic>Mem</italic></sub></td>
</tr></tbody>
</table>
</table-wrap>
<p>The number of spikes generated from the last layer is compared to the desired spikes in dedicated output nodes, serving as the primary loss measurement. In order to implement the suppressed membrane potential loss function, the model is pre-trained for 20 epochs to generate the target spike trains used for <italic>L</italic><sub><italic>Mem</italic></sub> definition. For a fair comparison, all the experiments are run for 5 independent trials, and the average performance and standard deviation are reported.</p></sec>
<sec>
<title>3.2. Datasets</title>
<p>Tests are performed on the speech classification datasets NTIDIDIGITS and Spiking Heidelberg Digits (SHD). Both datasets represent events in the form of spikes, containing rich temporal information that is naturally suited to be directly processed by an SNN. These datasets are considered benchmarks, allowing us to focus on the architecture and learning algorithm of the SNN without considering the spike generation method.</p>
<sec>
<title>3.2.1. NTIDIDIGITS</title>
<p>The NTIDIDIGITS (Anumula et al., <xref ref-type="bibr" rid="B2">2018</xref>) dataset was created by playing the TDIDIGITS (Leonard and Doddington, <xref ref-type="bibr" rid="B15">1993</xref>) to the 64 response channel silicon cochlea. The dataset includes single digits and connected digit sequences, all of which contain the 11 spoken digits (&#x0201C;oh,&#x0201D; and the digits &#x0201C;0&#x0201D; to &#x0201C;9&#x0201D;). For the n-way classification problem (single digits), there are a total of 55 male and 56 female speakers with 2,463 training samples, and 56 male and 53 female speakers in the testing set with a total of 2,486 samples. As shown in <xref ref-type="fig" rid="F3">Figure 3A</xref>, the time resolution is at the <italic>ms</italic> level and the channel index ranges from 0 to 63.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p><bold>(A)</bold> Raster plot of input spike trains from a single sample (label 0) of the NTIDIDIGITS dataset. The y-axis represents the channels of the cochlear model while the x-axis indicates the time. <bold>(B)</bold> An illustration of one raw example (word &#x0201C;six&#x0201D;) from the SHD dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0003.tif"/>
</fig></sec>
<sec>
<title>3.2.2. SHD</title>
<p>The SHD is the spiking version of the Heidelberg Digits (HD) audio dataset that is converted by a biologically inspired cochlea model (Cramer et al., <xref ref-type="bibr" rid="B7">2020</xref>). There are 8,156 and 2,264 spoken samples for training and testing, respectively. It contains utterances of the 10 digits &#x0201C;0&#x0201D; to &#x0201C;9&#x0201D; in English and German, with a total of 20 classes presented by 12 speakers. <xref ref-type="fig" rid="F3">Figure 3B</xref> shows an example of this audio spike stream. Each sample duration ranges from 0.24 to 1.17 s. Here, the time is resampled to speed up the training (Yin et al., <xref ref-type="bibr" rid="B44">2020</xref>). Each channel has at most 1 spike every 4 <italic>ms</italic> and shorter samples are padded with zeros.</p></sec></sec>
<sec>
<title>3.3. Overall results</title>
<p>This section demonstrates the benefits of the proposed innovations and assesses the effects of the VAD, Local skip-connection, and Suppressed loss individually to validate their impact on boosting performance. The basic SNN consists of 2 hidden layers, followed by the VAD module, Local skip-connection in each layer, and the suppressed loss module in the readout layer&#x00027;s membrane potential (<xref ref-type="fig" rid="F2">Figure 2</xref>).</p>
<p>1) NTIDIDIGITS. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, non-spiking approaches such as GRU-RNN and Phased-LSTM (Anumula et al., <xref ref-type="bibr" rid="B2">2018</xref>) achieve 90.90 and 91.25% accuracy, respectively. However, these RNNs rely on the event synthesis algorithm and cannot fully exploit sparse event-based information. Zhang and Li (<xref ref-type="bibr" rid="B50">2019</xref>) directly train the spike-train level features with recurrent layers through the ST-RSBP method, and Zhang and Li (<xref ref-type="bibr" rid="B51">2021</xref>) further propose the SrSc-SNNs architectures that consist of three self-recurrent layers with skip-connections, training this SNN using backpropagation-based intrinsic plasticity, achieving state-of-the-art (SOTA) performance. We show that with the proposed VAD module, local skip-connection, and suppressed loss, our method achieves 95.30% accuracy with a mean of 95.22% and a standard deviation of 0.08%, making it the best result in this classification task. Furthermore, our model uses the fewest parameters and is 10&#x000D7; smaller compared to the second-best result.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Comparison of classification and parameter count of proposed methods on the NTIDIDIGITS and SHD Test sets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Params</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" rowspan="5">N-TDIDIGITS18</td>
<td valign="top" align="left">GRU-RNN (Anumula et al., <xref ref-type="bibr" rid="B2">2018</xref>)<xref ref-type="table-fn" rid="TN1"><sup>&#x02020;</sup></xref></td>
<td valign="top" align="center">0.11M</td>
<td valign="top" align="center">90.90</td>
</tr>
<tr>
<td valign="top" align="left">Phased-LSTM (Anumula et al., <xref ref-type="bibr" rid="B2">2018</xref>)<xref ref-type="table-fn" rid="TN1"><sup>&#x02020;</sup></xref></td>
<td valign="top" align="center">0.61M</td>
<td valign="top" align="center">91.25</td>
</tr>
<tr>
<td valign="top" align="left">ST-RSBP (Zhang and Li, <xref ref-type="bibr" rid="B50">2019</xref>)</td>
<td valign="top" align="center">0.35M</td>
<td valign="top" align="center">93.90</td>
</tr>
<tr>
<td valign="top" align="left">SrSc-SNNs-IP (Zhang and Li, <xref ref-type="bibr" rid="B51">2021</xref>)</td>
<td valign="top" align="center">0.61M</td>
<td valign="top" align="center">95.07</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DL128-SNN-Dloss</bold></td>
<td valign="top" align="center"><bold>0.06M</bold></td>
<td valign="top" align="center"><bold>95.22</bold></td>
</tr>
<tr>
<td valign="top" align="left" rowspan="12">SHD</td>
<td valign="top" align="left">Feed-forward SNN (Cramer et al., <xref ref-type="bibr" rid="B7">2020</xref>)</td>
<td valign="top" align="center">0.09M</td>
<td valign="top" align="center">48.1</td>
</tr>
<tr>
<td valign="top" align="left">RSNN (Cramer et al., <xref ref-type="bibr" rid="B7">2020</xref>)</td>
<td valign="top" align="center">1.79M</td>
<td valign="top" align="center">83.2</td>
</tr>
<tr>
<td valign="top" align="left">RSNN with adaption (Yin et al., <xref ref-type="bibr" rid="B44">2020</xref>)</td>
<td valign="top" align="center">0.14M</td>
<td valign="top" align="center">84.40</td>
</tr>
<tr>
<td valign="top" align="left">Heterogeneous RSNN (Perez-Nieves et al., <xref ref-type="bibr" rid="B18">2021</xref>)</td>
<td valign="top" align="center">0.11M</td>
<td valign="top" align="center">82.78</td>
</tr>
<tr>
<td valign="top" align="left">RSNN with attention (Yao et al., <xref ref-type="bibr" rid="B42">2021</xref>)</td>
<td valign="top" align="center">0.14M</td>
<td valign="top" align="center">91.08</td>
</tr>
<tr>
<td valign="top" align="left">DMUC (Sun et al., <xref ref-type="bibr" rid="B25">2023b</xref>)<xref ref-type="table-fn" rid="TN1"><sup>&#x02020;</sup></xref></td>
<td valign="top" align="center">0.24M</td>
<td valign="top" align="center">91.48</td>
</tr>
<tr>
<td valign="top" align="left">CNN (Cramer et al., <xref ref-type="bibr" rid="B7">2020</xref>)<xref ref-type="table-fn" rid="TN1"><sup>&#x02020;</sup></xref></td>
<td valign="top" align="center">1.01M</td>
<td valign="top" align="center">92.40</td>
</tr>
<tr>
<td valign="top" align="left">RadLIF (Bittar and Garner, <xref ref-type="bibr" rid="B3">2022</xref>)</td>
<td valign="top" align="center">3.9M</td>
<td valign="top" align="center">94.62</td>
</tr>
<tr>
<td valign="top" align="left">DCLS (Hammouamri et al., <xref ref-type="bibr" rid="B11">2023</xref>)<xref ref-type="table-fn" rid="TN2"><sup>&#x0002A;</sup></xref></td>
<td valign="top" align="center">0.21M</td>
<td valign="top" align="center"><bold>95.07</bold></td>
</tr>
<tr>
<td valign="top" align="left">SNN with delays (Pati&#x000F1;o-Saucedo et al., <xref ref-type="bibr" rid="B17">2023</xref>)</td>
<td valign="top" align="center">0.1M</td>
<td valign="top" align="center">90.04</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DL128-SNN-Dloss</bold></td>
<td valign="top" align="center">0.14M</td>
<td valign="top" align="center">92.56</td>
</tr>
<tr>
<td valign="top" align="left"><bold>DL256-SNN-Dloss</bold></td>
<td valign="top" align="center">0.21M</td>
<td valign="top" align="center">93.55</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="TN1"><label>&#x02020;</label><p>Non-SNN implementation.</p></fn>
<fn id="TN2"><label>&#x0002A;</label><p>Channel reduction. Bold values are the best results.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>2) SHD. For this dataset, we compare our methods with recent advancements. In Cramer et al. (<xref ref-type="bibr" rid="B7">2020</xref>), the single feed-forward SNN and Recurrent SNN are both trained using BPTT. Their results show that the recurrent architecture outperforms the homogeneous feed-forward architecture in this challenging task, underscoring the potential advantages of intricate SNN designs. Several studies have ventured into specialized SNN architectures. For instance, some explore the effectiveness of the heterogeneous recurrent SNNs (Perez-Nieves et al., <xref ref-type="bibr" rid="B18">2021</xref>), while others delve into attention-based SNNs (Yao et al., <xref ref-type="bibr" rid="B42">2021</xref>). As detailed in <xref ref-type="table" rid="T3">Table 3</xref>, our proposed method produces a competitive performance of 92.56% in a two-layer fully connected network of 128 neurons each. Notably, this performance is competitive compared to these results that employ the same data processing methods and network architecture. Pati&#x000F1;o-Saucedo et al. (<xref ref-type="bibr" rid="B17">2023</xref>) introduce axonal delays in tandem with learnable time constants, enabling a reduction in model size to a mere 0.1 M while preserving competitive performance.</p>
<p>Additionally, RadLIF (Bittar and Garner, <xref ref-type="bibr" rid="B3">2022</xref>) combines an adaptive linear LIF neuron with the SG strategy, achieving a performance of 94.62%. This achievement is realized through the utilization of three recurrent spiking layers, each containing 1024 neurons. On the other hand, DCLS, introduced in Hammouamri et al.&#x00027;s research (Hammouamri et al., <xref ref-type="bibr" rid="B11">2023</xref>), capitalizes on several key innovations. It incorporates learnable position adjustments within the kernel, employs advanced data augmentation techniques (like the 5-channel binning), and incorporates batch normalization methods. As a result, DCLS achieves an accuracy of 95.07% using two feedforward spiking layers, each comprising 256 neurons. Given the sizeable 700-input channel, we mitigated extensive parameter expansion by augmenting the neural network&#x00027;s second layer from 128 to 256 neurons. This strategic adjustment significantly improved performance, yielding a 93.55% accuracy rate.</p></sec>
<sec>
<title>3.4. Ablation study</title>
<p>We delve into the contributions of VAD, Local skip-connection, and Suppressed loss via a comprehensive ablation study (refer to <xref ref-type="table" rid="T4">Table 4</xref>). Evaluating each method individually on two fully-connected feed-forward SNNs provides the following insights:</p>
<list list-type="bullet">
<list-item><p><bold>VAD:</bold> When incorporated, there is a marked enhancement in the accuracy across datasets. Specifically, with the delay module embedded (in the D128-SNN setup), we obtain gains of 14.47% and 18.68% for NTIDIDIGITS and SHD, respectively. Importantly, despite these advancements, the parameters remain nearly unchanged. This is attributed to our adoption of channel-wise delays, implying that the increase in parameters corresponds only to the number of channels in each layer. As an illustration, with the SHD dataset, the integration of VAD results in an increment of <italic>N</italic> parameters in each layer, with <italic>N</italic> being set to 128 in our experimental setup.</p></list-item>
<list-item><p><bold>Local skip-connection:</bold> Its standalone application (reflected in the Input-128FC-Local-128FC-Local-11 design) does not bolster accuracy notably. For the SHD dataset, the outcome is even slightly detrimental. However, this method increases the number of trainable parameters. This can be likened to the addition of an extra feedforward layer, resulting in a parameter increment of <italic>N</italic> &#x000D7; <italic>N</italic> for each layer.</p></list-item>
</list>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Ablation studies for different architecture and learning methods.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:&#x00023;919498;color:&#x00023;ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Network</bold></th>
<th valign="top" align="center"><bold>Params</bold></th>
<th valign="top" align="center"><bold>Accuracy (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left" rowspan="6">NTIDIDIGITS</td>
<td valign="top" align="left">Input-128FC-128FC-11</td>
<td valign="top" align="center">26,251</td>
<td valign="top" align="center">78.52</td>
</tr>
<tr>
<td valign="top" align="left">Input-128FC-Local-128FC-Local-11</td>
<td valign="top" align="center">59,275</td>
<td valign="top" align="center">79.36</td>
</tr>
<tr>
<td valign="top" align="left">D128-SNN</td>
<td valign="top" align="center">26,507</td>
<td valign="top" align="center">92.99</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN</td>
<td valign="top" align="center">59,531</td>
<td valign="top" align="center">94.70 &#x000B1; 0.35</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN-Dloss</td>
<td valign="top" align="center">59,531</td>
<td valign="top" align="center">95.22 &#x000B1; 0.08</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN-Dloss-L2</td>
<td valign="top" align="center">59,531</td>
<td valign="top" align="center">94.85 &#x000B1; 0.08</td>
</tr>
<tr>
<td valign="middle" align="left" rowspan="6">SHD</td>
<td valign="top" align="left">Input-128FC-128FC-20</td>
<td valign="top" align="center">108,820</td>
<td valign="top" align="center">67.05</td>
</tr>
<tr>
<td valign="top" align="left">Input-128FC-Local-128FC-Local-20</td>
<td valign="top" align="center">141,844</td>
<td valign="top" align="center">65.55</td>
</tr>
<tr>
<td valign="top" align="left">D128-SNN</td>
<td valign="top" align="center">109,076</td>
<td valign="top" align="center">85.73</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN</td>
<td valign="top" align="center">142,100</td>
<td valign="top" align="center">91.52 &#x000B1; 0.84</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN-Dloss</td>
<td valign="top" align="center">142,100</td>
<td valign="top" align="center">92.56 &#x000B1; 0.56</td>
</tr>
<tr>
<td valign="top" align="left">DL128-SNN-Dloss-L2</td>
<td valign="top" align="center">142,100</td>
<td valign="top" align="center">92.44 &#x000B1; 0.09</td>
</tr></tbody>
</table>
</table-wrap>
<p>Combining VAD and Local skip-connection in the DL128-SNN design yields significant benefits. We clinch state-of-the-art accuracy levels for both datasets. This highlights that the enhanced flexibility provided by VAD truly shines when paired with a richer parameter landscape, as provided by the Local skip-connection. Lastly, supplementing the above with the suppressed loss, Dloss, results in stellar performance: 95.22% for NTIDIDIGITS and 92.56% for SHD.</p></sec>
<sec>
<title>3.5. Axonal delay improves the characterization learning ability</title>
<p>In this section, we begin by offering a visual representation of the axonal delay distribution (refer to <xref ref-type="fig" rid="F4">Figure 4</xref>) for both datasets. Subsequently, we employ an L2 regularizer on the delay to curtail the magnitude of delay values, effectively reducing the number of delayed time steps.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Distribution of time delay on <bold>(A)</bold> NTIDIDIGITS, <bold>(B)</bold> SHD. The initial distributions are all 0. From left to right: first layer, second layer.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0004.tif"/>
</fig>
<p>Utilizing the NTIDIDIGITS dataset as an illustrative example, <xref ref-type="fig" rid="F4">Figure 4A</xref> reveals a delay distribution in the first layer that consistently encompasses both long and short delay neurons. This may imply that certain neurons focus on the initial portion of the input, whereas others concentrate on the latter segment of the input features. To understand the dynamics of the VAD, we inspect the cumulative spike count at the input of the network and compare it to the cumulative spike count at the true decision neuron for four different models, as depicted in <xref ref-type="fig" rid="F5">Figure 5</xref>. For illustrative purposes, we select four different spoken English digit utterances: &#x0201C;1&#x0201D;, &#x0201C;6&#x0201D;, &#x0201C;7&#x0201D;, and &#x0201C;10&#x0201D;. The figures clearly show that the model without delay gradually increases its prediction as the input spikes come in and starts to do so as soon as input spikes start arriving. Conversely, for the other three models equipped with delay modules, the decision to increase spike count in the true neuron is delayed but then increases more quickly and reaches a higher level. This phenomenon arises from the different neurons introducing varying delays to the spikes, thereby providing the terminal neuron with multi-scale information. This may be interpreted as the VAD-enabled network aggregating all information in the spoken word before triggering a decision using all that information simultaneously. Moreover, we can observe that the models with delay typically have a total latency of 60 time steps, which can be measured after the input is over. This is not only related to the delay itself but also to the choice of loss evaluation. As Shrestha et al. 
(<xref ref-type="bibr" rid="B22">2022</xref>) discussed, the spike-based negative log-likelihood loss results in early classification, even 1400 time steps faster than spike-rate based loss evaluation for NTIDIDIGITS datasets. However, the DL-128-SNN-Dloss generates the highest number of spikes for the true neuron compared to the other models, demonstrating its superior ability to learn characterizations.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Illustration of 4 distinct English examples (&#x0201C;1&#x0201D;, &#x0201C;6&#x0201D;, &#x0201C;7&#x0201D;, and &#x0201C;10&#x0201D;). The cumulative spike count of the input is plotted on the right y-axis (represented by the blue line), while the true neurons&#x00027; cumulative spike count is on the left y-axis. Four models are showcased: No delay, D128-SNN, DL128-SNN, and DL128-SNN-Dloss.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0005.tif"/>
</fig>
<p>Subsequently, the L2 loss is employed to confine the range of delay values to provide a more uniform distribution. This leads to a reduction in delay values for some neurons (see <xref ref-type="fig" rid="F6">Figure 6</xref>), aiming to reduce the total latency and investigate whether shorter delays contribute to a better classification system. This is achieved by applying the L2 regularizer to <inline-formula><mml:math id="M43"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. Nevertheless, as demonstrated in <xref ref-type="table" rid="T4">Table 4</xref>, the inclusion of the additional L2 loss results in a performance decline. This could indicate that the learned distributions achieved through these architectures may already be optimal within the current delay threshold, denoted as &#x003B8;<sub><italic>d</italic></sub>.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Application of the L2 regularizer on the distribution of time delay for <bold>(A)</bold> NTIDIDIGITS, <bold>(B)</bold> SHD. The initial distributions are all 0. From left to right: first layer, second layer.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0006.tif"/>
</fig></sec>
<sec>
<title>3.6. Local skip-connection as compensation for loss of information in reset mechanism</title>
<p>The positive impact of local skip-connections on the reset mechanism becomes evident when modulating the refractory scale, symbolized as &#x003B1;<sub><italic>r</italic></sub>. We conduct a comparative analysis of performance between two distinct configurations: one labeled as VAD, which encompasses solely the delay model, and the other designated as VAD&#x0002B;Local, which additionally incorporates local skip-connections. As shown in <xref ref-type="fig" rid="F7">Figure 7</xref>, the Local skip-connection maintains high performance across a wider range of refractory scales &#x003B1;<sub><italic>r</italic></sub>, while the performance with only the VAD module starts to decline with high values. This observation aligns with our earlier conjecture that larger values of &#x003B1;<sub><italic>r</italic></sub> may induce information loss, as the neuron&#x00027;s potential struggles to recover efficiently. In contrast, the presence of local connections mitigates this loss by dynamically triggering spiking events among local neurons. Thus, our Local skip-connection diminishes sensitivity to parameter selection, potentially providing more flexibility to train SNNs for varied tasks, indicating that a consistent alpha value can be effective for different tasks.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>The influence of the different refractory scale &#x003B1;<sub><italic>r</italic></sub> on accuracy is examined under &#x0201C;VAD&#x0201D; and &#x0201C;VAD&#x0002B;Local&#x0201D; architecture. &#x0201C;VAD&#x0201D; refers to the performance of using only the VAD module, while &#x0201C;VAD&#x0002B;Local&#x0201D; represents the performance using both VAD and local skip-connections. <bold>(A)</bold> NTIDIDIGITS dataset. <bold>(B)</bold> SHD dataset. For this experiment, we use two dense layers with 128 neurons.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-17-1275944-g0007.tif"/>
</fig></sec></sec>
<sec sec-type="conclusions" id="s4">
<title>4. Conclusion</title>
<p>In this study, we introduce several innovative components aimed at enhancing the performance of Spiking Neural Networks (SNNs): the learnable axonal delay module, combined with a local skip connection architecture, and augmented with an auxiliary suppressed loss. The variable axonal delay module plays a pivotal role in aligning spike timing, thereby enhancing the network&#x00027;s capacity for representation. The local skip-connection mechanism compensates for the information loss during the reset process. This enhances network dynamics and reduces the sensitivity to refractory scale tuning, making it more versatile. The inclusion of the suppressed loss works to suppress erroneous neuron firing, facilitating the SNN in making more accurate label distinctions. Importantly, these methods can be seamlessly integrated into the existing framework through the use of backpropagation algorithms.</p>
<p>We demonstrate that the proposed methods boost performance on two benchmark event-based speech datasets with the fewest parameters. Our methods highlight the immense potential of employing them in tandem with a cochlear front-end that encodes features of auditory inputs using spikes, creating a robust bio-inspired system. Our work emphasizes the importance of delving into different dynamic SNN architectures and learning algorithms for tasks involving datasets with rich temporal complexity.</p>
<p>In future work, it will be interesting to investigate the spike count distribution per layer and the total computational cost. Additionally, more exploration could be focused on latency by studying the influence of different loss evaluations and dynamic caps for axonal delays. Since current work mainly focuses on cochlear features with a bio-inspired approach, it would also be intriguing to apply these methods to visual tasks that involve inherent temporal information.</p></sec>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p></sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>PS: Conceptualization, Investigation, Software, Validation, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing. YC: Conceptualization, Investigation, Supervision, Writing&#x02014;review &#x00026; editing. PD: Supervision, Writing&#x02014;review &#x00026; editing, Conceptualization, Investigation. DB: Conceptualization, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Writing&#x02014;review &#x00026; editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the Flemish Government under the &#x0201C;Onderzoeksprogramma Artifici&#x000EB;le Intelligentie (AI) Vlaanderen&#x0201D; and the Research Foundation - Flanders under Grant Number G0A0220N (FWO WithMe project). The work of YC was supported in part by the National Key Research and Development Program of China (Grant No. 2021ZD0200300).</p>
</sec>
<ack><p>The authors would like to express their very great appreciation to Sumit Bam Shrestha for his valuable and constructive suggestions and technical support during the development of this research work.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="disclaimer" id="s8">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Akopyan</surname> <given-names>F.</given-names></name> <name><surname>Sawada</surname> <given-names>J.</given-names></name> <name><surname>Cassidy</surname> <given-names>A.</given-names></name> <name><surname>Alvarez-Icaza</surname> <given-names>R.</given-names></name> <name><surname>Arthur</surname> <given-names>J.</given-names></name> <name><surname>Merolla</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>TrueNorth: design and tool flow of a 65 mw 1 million neuron programmable neurosynaptic chip</article-title>. <source>IEEE Trans. Comput. Aided Design Integr. Circ. Syst.</source> <volume>34</volume>, <fpage>1537</fpage>&#x02013;<lpage>1557</lpage>. <pub-id pub-id-type="doi">10.1109/TCAD.2015.2474396</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Anumula</surname> <given-names>J.</given-names></name> <name><surname>Neil</surname> <given-names>D.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name> <name><surname>Liu</surname> <given-names>S.-C.</given-names></name></person-group> (<year>2018</year>). <article-title>Feature representations for neuromorphic audio spike streams</article-title>. <source>Front. Neurosci.</source> <volume>12</volume>, <fpage>23</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00023</pub-id><pub-id pub-id-type="pmid">29479300</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bittar</surname> <given-names>A.</given-names></name> <name><surname>Garner</surname> <given-names>P. N.</given-names></name></person-group> (<year>2022</year>). <article-title>A surrogate gradient spiking baseline for speech command recognition</article-title>. <source>Front. Neurosci.</source> <volume>16</volume>, <fpage>865897</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2022.865897</pub-id><pub-id pub-id-type="pmid">36117617</pub-id></citation></ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Blouw</surname> <given-names>P.</given-names></name> <name><surname>Eliasmith</surname> <given-names>C.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Event-driven signal processing with neuromorphic computing systems,&#x0201D;</article-title> in <source>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>8534</fpage>&#x02013;<lpage>8538</lpage>.</citation></ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bohte</surname> <given-names>S. M.</given-names></name> <name><surname>Kok</surname> <given-names>J. N.</given-names></name> <name><surname>La Poutre</surname> <given-names>H.</given-names></name></person-group> (<year>2002</year>). <article-title>Error-backpropagation in temporally encoded networks of spiking neurons</article-title>. <source>Neurocomputing</source> <volume>48</volume>, <fpage>17</fpage>&#x02013;<lpage>37</lpage>. <pub-id pub-id-type="doi">10.1016/S0925-2312(01)00658-0</pub-id></citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Carr</surname> <given-names>C. E.</given-names></name> <name><surname>Konishi</surname> <given-names>M.</given-names></name></person-group> (<year>1988</year>). <article-title>Axonal delay lines for time measurement in the owl&#x00027;s brainstem</article-title>. <source>Proc. Natl. Acad. Sci. U.S.A.</source> <volume>85</volume>, <fpage>8311</fpage>&#x02013;<lpage>8315</lpage>.<pub-id pub-id-type="pmid">3186725</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cramer</surname> <given-names>B.</given-names></name> <name><surname>Stradmann</surname> <given-names>Y.</given-names></name> <name><surname>Schemmel</surname> <given-names>J.</given-names></name> <name><surname>Zenke</surname> <given-names>F.</given-names></name></person-group> (<year>2020</year>). <article-title>The Heidelberg spiking data sets for the systematic evaluation of spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>33</volume>, <fpage>2744</fpage>&#x02013;<lpage>2757</lpage>.<pub-id pub-id-type="pmid">33378266</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Davies</surname> <given-names>M.</given-names></name> <name><surname>Srinivasa</surname> <given-names>N.</given-names></name> <name><surname>Lin</surname> <given-names>T.-H.</given-names></name> <name><surname>Chinya</surname> <given-names>G.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Choday</surname> <given-names>S. H.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>LOIHI: a neuromorphic manycore processor with on-chip learning</article-title>. <source>IEEE Micro</source> <volume>38</volume>, <fpage>82</fpage>&#x02013;<lpage>99</lpage>. <pub-id pub-id-type="doi">10.1109/MM.2018.112130359</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Incorporating learnable membrane time constant to enhance learning of spiking neural networks,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>2661</fpage>&#x02013;<lpage>2671</lpage>.</citation></ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Furber</surname> <given-names>S. B.</given-names></name> <name><surname>Galluppi</surname> <given-names>F.</given-names></name> <name><surname>Temple</surname> <given-names>S.</given-names></name> <name><surname>Plana</surname> <given-names>L. A.</given-names></name></person-group> (<year>2014</year>). <article-title>The SpiNNaker project</article-title>. <source>Proc. IEEE</source> <volume>102</volume>, <fpage>652</fpage>&#x02013;<lpage>665</lpage>. <pub-id pub-id-type="doi">10.1109/JPROC.2014.2304638</pub-id></citation></ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hammouamri</surname> <given-names>I.</given-names></name> <name><surname>Khalfaoui-Hassani</surname> <given-names>I.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Learning delays in spiking neural networks using dilated convolutions with learnable spacings</article-title>. <source>arXiv preprint arXiv:2306.17670</source>.</citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hong</surname> <given-names>C.</given-names></name> <name><surname>Wei</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Deng</surname> <given-names>B.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Che</surname> <given-names>Y.</given-names></name></person-group> (<year>2019</year>). <article-title>Training spiking neural networks for cognitive tasks: a versatile framework compatible with various temporal codes</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>31</volume>, <fpage>1285</fpage>&#x02013;<lpage>1296</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2019.2919662</pub-id><pub-id pub-id-type="pmid">31247574</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Iyer</surname> <given-names>L. R.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>Is neuromorphic MNIST neuromorphic? Analyzing the discriminative power of neuromorphic datasets in the time domain</article-title>. <source>Front. Neurosci.</source> <volume>15</volume>, <fpage>608567</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2021.608567</pub-id><pub-id pub-id-type="pmid">33841072</pub-id></citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2014</year>). <article-title>Adam: a method for stochastic optimization</article-title>. <source>arXiv preprint arXiv:1412.6980</source>.</citation></ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Leonard</surname> <given-names>R. G.</given-names></name> <name><surname>Doddington</surname> <given-names>G.</given-names></name></person-group> (<year>1993</year>). <source>Tidigits Speech Corpus</source>. <publisher-loc>IEEE</publisher-loc>: <publisher-name>Texas Instruments, Inc</publisher-name>.</citation></ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mostafa</surname> <given-names>H.</given-names></name></person-group> (<year>2017</year>). <article-title>Supervised learning based on temporal coding in spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>29</volume>, <fpage>3227</fpage>&#x02013;<lpage>3235</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2017.2726060</pub-id><pub-id pub-id-type="pmid">28783639</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pati&#x000F1;o-Saucedo</surname> <given-names>A.</given-names></name> <name><surname>Yousefzadeh</surname> <given-names>A.</given-names></name> <name><surname>Tang</surname> <given-names>G.</given-names></name> <name><surname>Corradi</surname> <given-names>F.</given-names></name> <name><surname>Linares-Barranco</surname> <given-names>B.</given-names></name> <name><surname>Sifalakis</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Empirical study on the efficiency of spiking neural networks with axonal delays, and algorithm-hardware benchmarking,&#x0201D;</article-title> in <source>2023 IEEE International Symposium on Circuits and Systems (ISCAS)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>5</lpage>.</citation></ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Perez-Nieves</surname> <given-names>N.</given-names></name> <name><surname>Leung</surname> <given-names>V. C.</given-names></name> <name><surname>Dragotti</surname> <given-names>P. L.</given-names></name> <name><surname>Goodman</surname> <given-names>D. F.</given-names></name></person-group> (<year>2021</year>). <article-title>Neural heterogeneity promotes robust learning</article-title>. <source>Nat. Commun.</source> <volume>12</volume>, <fpage>1</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41467-021-26022-3</pub-id><pub-id pub-id-type="pmid">34608134</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Seidl</surname> <given-names>A. H.</given-names></name></person-group> (<year>2014</year>). <article-title>Regulation of conduction time along axons</article-title>. <source>Neuroscience</source> <volume>276</volume>, <fpage>126</fpage>&#x02013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1016/j.neuroscience.2013.06.047</pub-id><pub-id pub-id-type="pmid">23820043</pub-id></citation></ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>ESL-SNNs: an evolutionary structure learning strategy for spiking neural networks</article-title>. <source>arXiv preprint arXiv:2306.03693</source>.</citation></ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Shrestha</surname> <given-names>S. B.</given-names></name> <name><surname>Orchard</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;SLAYER: spike layer error reassignment in time,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems 31</source> (<publisher-loc>IEEE</publisher-loc>).</citation></ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Shrestha</surname> <given-names>S. B.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Sun</surname> <given-names>P.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Spikemax: spike-based loss methods for classification,&#x0201D;</article-title> in <source>2022 International Joint Conference on Neural Networks (IJCNN)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>.</citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Stoelzel</surname> <given-names>C. R.</given-names></name> <name><surname>Bereshpolova</surname> <given-names>Y.</given-names></name> <name><surname>Alonso</surname> <given-names>J.-M.</given-names></name> <name><surname>Swadlow</surname> <given-names>H. A.</given-names></name></person-group> (<year>2017</year>). <article-title>Axonal conduction delays, brain state, and corticogeniculate communication</article-title>. <source>J. Neurosci.</source> <volume>37</volume>, <fpage>6342</fpage>&#x02013;<lpage>6358</lpage>. <pub-id pub-id-type="doi">10.1523/JNEUROSCI.0444-17.2017</pub-id><pub-id pub-id-type="pmid">28559382</pub-id></citation></ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>P.</given-names></name> <name><surname>Eqlimi</surname> <given-names>E.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Devos</surname> <given-names>P.</given-names></name> <name><surname>Botteldooren</surname> <given-names>D.</given-names></name></person-group> (<year>2023a</year>). <article-title>&#x0201C;Adaptive axonal delays in feedforward spiking neural networks for accurate spoken word recognition,&#x0201D;</article-title> in <source>ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>5</lpage>.</citation></ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>P.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Devos</surname> <given-names>P.</given-names></name> <name><surname>Botteldooren</surname> <given-names>D.</given-names></name></person-group> (<year>2023b</year>). <article-title>Delayed memory unit: modelling temporal dependency through delay gate</article-title>. <source>arXiv preprint arXiv:2310.14982</source>.</citation></ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>P.</given-names></name> <name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Botteldooren</surname> <given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Axonal delay as a short-term memory for feed forward deep spiking neural networks,&#x0201D;</article-title> in <source>ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source>, <fpage>8932</fpage>&#x02013;<lpage>8936</lpage>.</citation></ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Taherkhani</surname> <given-names>A.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Maguire</surname> <given-names>L. P.</given-names></name></person-group> (<year>2015</year>). <article-title>DL-ReSuMe: a delay learning-based remote supervised method for spiking neurons</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>26</volume>, <fpage>3137</fpage>&#x02013;<lpage>3149</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2015.2404938</pub-id><pub-id pub-id-type="pmid">25794401</pub-id></citation></ref>
<ref id="B28">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Talidou</surname> <given-names>A.</given-names></name> <name><surname>Frankland</surname> <given-names>P. W.</given-names></name> <name><surname>Mabbott</surname> <given-names>D.</given-names></name> <name><surname>Lefebvre</surname> <given-names>J.</given-names></name></person-group> (<year>2022</year>). <article-title>Homeostatic coordination and up-regulation of neural activity by activity-dependent myelination</article-title>. <source>Nat. Comput. Sci.</source> <volume>2</volume>, <fpage>665</fpage>&#x02013;<lpage>676</lpage>. <pub-id pub-id-type="doi">10.1038/s43588-022-00315-z</pub-id></citation></ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Lin</surname> <given-names>X.</given-names></name> <name><surname>Dang</surname> <given-names>X.</given-names></name></person-group> (<year>2019</year>). <article-title>A delay learning algorithm based on spike train kernels for spiking neurons</article-title>. <source>Front. Neurosci.</source> <volume>13</volume>, <fpage>252</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2019.00252</pub-id><pub-id pub-id-type="pmid">30971877</pub-id></citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Werbos</surname> <given-names>P. J.</given-names></name></person-group> (<year>1990</year>). <article-title>Backpropagation through time: what it does and how to do it</article-title>. <source>Proc. IEEE</source> <volume>78</volume>, <fpage>1550</fpage>&#x02013;<lpage>1560</lpage>.</citation></ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2018a</year>). <article-title>&#x0201C;A biologically plausible speech recognition framework based on spiking neural networks,&#x0201D;</article-title> in <source>2018 International Joint Conference on Neural Networks (IJCNN)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>8</lpage>.</citation></ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Tan</surname> <given-names>K. C.</given-names></name></person-group> (<year>2021</year>). <article-title>A tandem learning rule for effective training and rapid inference of deep spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>.<pub-id pub-id-type="pmid">34288879</pub-id></citation></ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Tan</surname> <given-names>K. C.</given-names></name></person-group> (<year>2018b</year>). <article-title>A spiking neural network framework for robust sound classification</article-title>. <source>Front. Neurosci.</source> <volume>12</volume>, <fpage>836</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00836</pub-id><pub-id pub-id-type="pmid">30510500</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Pan</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Das</surname> <given-names>R. K.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Robust sound recognition: a neuromorphic approach,&#x0201D;</article-title> in <source>Interspeech</source>, <fpage>3667</fpage>&#x02013;<lpage>3668</lpage>.</citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Y&#x00131;lmaz</surname> <given-names>E.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Tan</surname> <given-names>K. C.</given-names></name></person-group> (<year>2020</year>). <article-title>Deep spiking neural networks for large vocabulary automatic speech recognition</article-title>. <source>Front. Neurosci.</source> <volume>14</volume>, <fpage>199</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2020.00199</pub-id><pub-id pub-id-type="pmid">32256308</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2018c</year>). <article-title>Spatio-temporal backpropagation for training high-performance spiking neural networks</article-title>. <source>Front. Neurosci.</source> <volume>12</volume>, <fpage>331</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00331</pub-id><pub-id pub-id-type="pmid">29875621</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Fang</surname> <given-names>X.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <article-title>Biologically inspired structure learning with reverse knowledge distillation for spiking neural networks</article-title>. <source>arXiv preprint arXiv:2304.09500</source>.</citation></ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name></person-group> (<year>2023b</year>). <article-title>&#x0201C;Constructing deep spiking neural networks from artificial neural networks with knowledge distillation,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>7886</fpage>&#x02013;<lpage>7895</lpage>.</citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>P.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Hierarchical spiking-based model for efficient image classification with enhanced feature extraction and encoding</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <pub-id pub-id-type="doi">10.1109/TNNLS.2022.3232106</pub-id><pub-id pub-id-type="pmid">37015639</pub-id></citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Qi</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;CSNN: an augmented spiking based framework with perceptron-inception,&#x0201D;</article-title> in <source>IJCAI</source>, <fpage>1646</fpage>.</citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Ran</surname> <given-names>X.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name></person-group> (<year>2021</year>). <article-title>Robust transcoding sensory information with neural spikes</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>33</volume>, <fpage>1935</fpage>&#x02013;<lpage>1946</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3107449</pub-id><pub-id pub-id-type="pmid">34665741</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Gao</surname> <given-names>H.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Temporal-wise attention spiking neural networks for event streams classification,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>10221</fpage>&#x02013;<lpage>10230</lpage>.</citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Y&#x00131;lmaz</surname> <given-names>E.</given-names></name> <name><surname>Gevrek</surname> <given-names>O. B.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Meng</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Deep convolutional spiking neural networks for keyword spotting,&#x0201D;</article-title> in <source>Proceedings of Interspeech</source>, <fpage>2557</fpage>&#x02013;<lpage>2561</lpage>.</citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yin</surname> <given-names>B.</given-names></name> <name><surname>Corradi</surname> <given-names>F.</given-names></name> <name><surname>Boht&#x000E9;</surname> <given-names>S. M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Effective and efficient computation with multiple-timescale spiking recurrent neural networks,&#x0201D;</article-title> in <source>International Conference on Neuromorphic Systems 2020</source>, <fpage>1</fpage>&#x02013;<lpage>8</lpage>.</citation></ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yin</surname> <given-names>B.</given-names></name> <name><surname>Corradi</surname> <given-names>F.</given-names></name> <name><surname>Boht&#x000E9;</surname> <given-names>S. M.</given-names></name></person-group> (<year>2021</year>). <article-title>Accurate and efficient time-domain classification with adaptive spiking recurrent neural networks</article-title>. <source>Nat. Mach. Intell.</source> <volume>3</volume>, <fpage>905</fpage>&#x02013;<lpage>913</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-021-00397-w</pub-id></citation></ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>Q.</given-names></name> <name><surname>Ma</surname> <given-names>C.</given-names></name> <name><surname>Song</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Dang</surname> <given-names>J.</given-names></name> <name><surname>Tan</surname> <given-names>K. C.</given-names></name></person-group> (<year>2022</year>). <article-title>Constructing accurate and efficient deep spiking neural networks with double-threshold and augmented schemes</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>33</volume>, <fpage>1714</fpage>&#x02013;<lpage>1726</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2020.3043415</pub-id><pub-id pub-id-type="pmid">33471769</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Amornpaisannon</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Rectified linear postsynaptic potential function for backpropagation in deep spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst</source>. <volume>33</volume>, <fpage>1947</fpage>&#x02013;<lpage>1958</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3110991</pub-id><pub-id pub-id-type="pmid">34534091</pub-id></citation></ref>
<ref id="B48">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Pan</surname> <given-names>Z.</given-names></name> <name><surname>Xie</surname> <given-names>X.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Supervised learning in spiking neural networks with synaptic delay-weight plasticity</article-title>. <source>Neurocomputing</source> <volume>409</volume>, <fpage>103</fpage>&#x02013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2020.03.079</pub-id></citation></ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Chua</surname> <given-names>Y.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <name><surname>Pan</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;MPD-AL: an efficient membrane potential driven aggregate-label learning algorithm for spiking neurons,&#x0201D;</article-title> in <source>Proceedings of the AAAI Conference on Artificial Intelligence</source>.</citation></ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Li</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Spike-train level backpropagation for training deep recurrent spiking neural networks</article-title>. <source>arXiv preprint arXiv:1908.06378</source>.</citation></ref>
<ref id="B51">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>W.</given-names></name> <name><surname>Li</surname> <given-names>P.</given-names></name></person-group> (<year>2021</year>). <article-title>Skip-connected self-recurrent spiking neural networks with joint intrinsic parameter and synaptic weight training</article-title>. <source>Neural Comput.</source> <volume>33</volume>, <fpage>1886</fpage>&#x02013;<lpage>1913</lpage>. <pub-id pub-id-type="doi">10.1162/neco_a_01393</pub-id><pub-id pub-id-type="pmid">34411267</pub-id></citation></ref>
</ref-list>
</back>
</article>