<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2023.1261543</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>STCA-SNN: self-attention-based temporal-channel joint attention for spiking neural networks</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Xiyan</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2379700/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology"/>
<role content-type="https://credit.niso.org/contributor-roles/software"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Song</surname>
<given-names>Yong</given-names>
</name>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1930867/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhou</surname>
<given-names>Ya</given-names>
</name>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jiang</surname>
<given-names>Yurong</given-names>
</name>
<role content-type="https://credit.niso.org/contributor-roles/supervision"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bai</surname>
<given-names>Yashuo</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2563317/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis"/>
<role content-type="https://credit.niso.org/contributor-roles/validation"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Xinyi</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2561841/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis"/>
<role content-type="https://credit.niso.org/contributor-roles/software"/>
<role content-type="https://credit.niso.org/contributor-roles/validation"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Xin</given-names>
</name>
<uri xlink:href="https://loop.frontiersin.org/people/2562003/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing"/>
</contrib>
</contrib-group>
<aff><institution>School of Optics and Photonics, Beijing Institute of Technology</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Lei Deng, Tsinghua University, China</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Malu Zhang, National University of Singapore, Singapore; Man Yao, Xi'an Jiaotong University, China; Fangwen Yu, Tsinghua University, China</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Yong Song, <email>yongsong@bit.edu.cn</email></corresp>
<corresp id="c002">Ya Zhou, <email>zhouya@bit.edu.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>10</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>17</volume>
<elocation-id>1261543</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>07</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>10</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2023 Wu, Song, Zhou, Jiang, Bai, Li and Yang.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Wu, Song, Zhou, Jiang, Bai, Li and Yang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Spiking Neural Networks (SNNs) have shown great promise in processing spatio-temporal information compared to Artificial Neural Networks (ANNs). However, there remains a performance gap between SNNs and ANNs, which impedes the practical application of SNNs. With intrinsic event-triggered property and temporal dynamics, SNNs have the potential to effectively extract spatio-temporal features from event streams. To leverage the temporal potential of SNNs, we propose a self-attention-based temporal-channel joint attention SNN (STCA-SNN) with end-to-end training, which infers attention weights along both temporal and channel dimensions concurrently. It models global temporal and channel information correlations with self-attention, enabling the network to learn &#x2018;what&#x2019; and &#x2018;when&#x2019; to attend simultaneously. Our experimental results show that STCA-SNNs achieve better performance on N-MNIST (99.67%), CIFAR10-DVS (81.6%), and N-Caltech 101 (80.88%) compared with the state-of-the-art SNNs. Meanwhile, our ablation study demonstrates that STCA-SNNs improve the accuracy of event stream classification tasks.</p>
</abstract>
<kwd-group>
<kwd>spiking neural networks</kwd>
<kwd>self-attention</kwd>
<kwd>temporal-channel</kwd>
<kwd>neuromorphic computing</kwd>
<kwd>event streams</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="4"/>
<equation-count count="13"/>
<ref-count count="61"/>
<page-count count="10"/>
<word-count count="6931"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Neuromorphic Engineering</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1.</label>
<title>Introduction</title>
<p>As the representatives of mimicking the human brain at the neuronal level, Spiking Neural Networks (SNNs) have gained great attraction for the high biological plausibility, event-driven property, and high energy efficiency (<xref ref-type="bibr" rid="ref39">Rieke et al., 1999</xref>; <xref ref-type="bibr" rid="ref15">Gerstner et al., 2014</xref>; <xref ref-type="bibr" rid="ref3">Bellec et al., 2018</xref>). Using time as an additional input dimension, SNNs record valuable information in a sparse manner and deliver information through spikes only when the membrane potential reaches the firing threshold (<xref ref-type="bibr" rid="ref30">Mainen and Sejnowski, 1995</xref>). Inspired by biological visual processing mechanisms, Dynamic Vision Sensors (DVS) encode the time, location, and polarity of the brightness changes per pixel into event streams (<xref ref-type="bibr" rid="ref28">Lichtsteiner et al., 2008</xref>; <xref ref-type="bibr" rid="ref34">Posch et al., 2010</xref>). With its unique advantages of high event rate, high dynamic range, and fewer resource requirements (<xref ref-type="bibr" rid="ref14">Gallego et al., 2020</xref>), DVS has broad application prospects in various visual tasks, such as autonomous driving (<xref ref-type="bibr" rid="ref8">Cheng et al., 2019</xref>), high-speed object tracking (<xref ref-type="bibr" rid="ref37">Rebecq et al., 2019</xref>), optical flow estimation (<xref ref-type="bibr" rid="ref38">Ridwan and Cheng, 2017</xref>), and action recognition (<xref ref-type="bibr" rid="ref1">Amir et al., 2017</xref>). Event-based vision is one of the typical advantage application scenarios of SNNs, providing a platform for demonstrating the capabilities of spiking neurons to process information with spatio-temporal dynamics.</p>
<p>Although the intrinsic time-dependent neuron dynamics endows SNNs with the ability to process spatio-temporal information, there remains a performance gap between SNNs and ANNs. Recently, ANNs&#x2019; modules (<xref ref-type="bibr" rid="ref19">Hu et al., 2021</xref>; <xref ref-type="bibr" rid="ref51">Yang et al., 2021</xref>; <xref ref-type="bibr" rid="ref52">Yao et al., 2021</xref>, <xref ref-type="bibr" rid="ref55">2023c</xref>) have been integrated into SNNs to improve the performance of SNNs. CSNN (<xref ref-type="bibr" rid="ref50">Xu et al., 2018</xref>) first validated the application of convolution structure on SNNs, promoting the development of SNNs. Convolution-based SNNs share weights across both temporal and spatial dimensions, following the assumption of spatio-temporal invariance (<xref ref-type="bibr" rid="ref20">Huang et al., 2022</xref>). This approach can be regarded as a local way of information extraction since convolutional operations can only process a local neighborhood at a time, either in space or time. However, when dealing with sequential data like event streams, capturing long-distance dependencies is of central importance to modeling complex temporal dynamics. Non-local operations (<xref ref-type="bibr" rid="ref45">Wang et al., 2018</xref>) provided a solution as a building block by computing the response at a position as a weighted sum of the features at all positions. The range of positions can span across space, time, or spacetime, allowing non-local operators to achieve remarkable success in vision attention.</p>
<p>The attention mechanism is inspired by the human ability to selectively find prominent areas in complex scenes (<xref ref-type="bibr" rid="ref21">Itti et al., 1998</xref>). A popular research direction is to present attention as a lightweight auxiliary unit to improve the representation power of the basic model. In the ANNs domain, <xref ref-type="bibr" rid="ref2">Ba et al. (2014)</xref> first introduced the term &#x201C;visual attention&#x201D; for image classification tasks, utilizing attention to identify relevant regions and locations within the input image. This approach also reduces the computational complexity of the proposed model regarding the size of the input image. SENet (<xref ref-type="bibr" rid="ref18">Hu et al., 2018</xref>) was introduced to reweight the channel-wise responses of the convolutional features, determining &#x201C;what&#x201D; to pay attention to. CBAM (<xref ref-type="bibr" rid="ref46">Woo et al., 2018</xref>) inferred attention maps sequentially along channel-wise and spatial dimensions for refining the input feature, determining &#x201C;what&#x201D; and &#x201C;where&#x201D; to pay attention to concurrently. In the SNNs domain, TA-SNN (<xref ref-type="bibr" rid="ref52">Yao et al., 2021</xref>) first extended the channel-wise attention concept to temporal-wise attention and integrated it into SNNs to determine &#x2018;when&#x2019; to pay attention. MA-SNN (<xref ref-type="bibr" rid="ref55">Yao et al., 2023c</xref>) extended CBAM to SNNs and proposed a multi-dimensional attention module along temporal-wise, channel-wise, and spatial-wise separately or simultaneously. Recently, TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>) coordinated temporal-wise and channel-wise attention correlations using the 1-D convolution operation to present the correlation between time-steps and channels. 
However, the receptive field of TCJA-SNN is a local cross shape that is restricted by its convolution kernels, shown in <xref ref-type="fig" rid="fig1">Figure 1A</xref>. Thus long-range dependencies can only be captured when 1-D convolution operation is repeated, which makes multi-hop dependency modeling difficult. On the other hand, self-attention, another vital feature of the human biological system, possesses the ability to capture feature dependencies effectively as an additional non-local operator alongside SE and CBAM. It has sparked a significant wave of interest and achieved remarkable success in various tasks (<xref ref-type="bibr" rid="ref44">Vaswani et al., 2017</xref>; <xref ref-type="bibr" rid="ref11">Dosovitskiy et al., 2020</xref>; <xref ref-type="bibr" rid="ref29">Liu et al., 2021</xref>). Intuitively, there is a compelling interest in investigating the application of self-attention in SNNs to advance deep learning, when considering the biological characteristics of both mechanisms (<xref ref-type="bibr" rid="ref53">Yao et al., 2023a</xref>,<xref ref-type="bibr" rid="ref54">b</xref>; <xref ref-type="bibr" rid="ref60">Zhou C. et al., 2023</xref>; <xref ref-type="bibr" rid="ref61">Zhou Z. et al., 2023</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Illustration of receptive fields on channel and temporal domains. <italic>T</italic> means the temporal domain, <italic>C</italic> means the channel domain, and <italic>H</italic>, <italic>W</italic> represent the spatial domain. <bold>(A)</bold> TCJA-SNN utilizes two local attention mechanisms with 1-D convolution along temporal-wise and channel-wise, respectively, then fuse them, forming a cross-shaped receptive field. <bold>(B)</bold> STCA-SNN uses self-attention operation to establish temporal-wise and channel-wise correlations, forming a global receptive field.</p>
</caption>
<graphic xlink:href="fnins-17-1261543-g001.tif"/>
</fig>
<p>To address the local spatio-temporal receptive field limitation of TCJA, we first adopt self-attention, a non-local operation, to model global temporal and channel information correlations. The self-attention module we employed can capture the global spatio-temporal receptive field, as shown in <xref ref-type="fig" rid="fig1">Figure 1B</xref>, allowing for the direct long-range dependencies modeling, which is the highlight of our work. We propose a plug-and-play Self-attention-based Temporal-Channel joint Attention (STCA) module for SNNs with end-to-end training. The STCA-SNNs can learn to focus on different features of the input at each time-step. In other words, the STCA-SNNs can learn &#x2018;when&#x2019; and &#x2018;what&#x2019; to attend concurrently, enhancing the ability of the SNNs to process temporal information. We evaluated the effectiveness of STCA-SNNs across different architectures on three benchmark event stream classification datasets: N-MNIST, CIFAR10-DVS, and N-Caltech 101. Our detailed experiments show that STCA-SNNs achieve competitive accuracy with existing state-of-the-art SNNs.</p>
<p>The main contributions of our work are summarized as follows:</p>
<list list-type="order">
<list-item>
<p>We propose STCA-SNNs for event streams that can undertake end-to-end training and inference tasks.</p>
</list-item>
<list-item>
<p>The plug-and-play STCA module models global temporal and channel correlations with self-attention, allowing the network to learn &#x2018;when&#x2019; and &#x2018;what&#x2019; to attend simultaneously. This enhances the ability of SNNs to process temporal information.</p>
</list-item>
<list-item>
<p>We evaluate the performance of STCA-SNNs on three benchmark event stream classification datasets, N-MNIST, CIFAR10-DVS, and N-Caltech 101. Our experimental results demonstrate that STCA-SNNs achieve competitive accuracy compared to existing state-of-the-art SNNs.</p>
</list-item>
</list>
</sec>
<sec id="sec2">
<label>2.</label>
<title>Related work</title>
<sec id="sec3">
<label>2.1.</label>
<title>Attention in SNNs</title>
<p>Spiking neural networks benefit from biological plausibility and continuously pursue the combination with brain mechanisms. The attention mechanism draws inspiration from the human ability to selectively identify salient regions within complex scenes and has gained remarkable success in deep learning by allocating attention weights preferentially to the most informative input components. A popular research direction is to present attention as an auxiliary module that can be easily integrated with existing architectures to boost the representation power of the basic model (<xref ref-type="bibr" rid="ref18">Hu et al., 2018</xref>; <xref ref-type="bibr" rid="ref46">Woo et al., 2018</xref>; <xref ref-type="bibr" rid="ref16">Guo et al., 2022</xref>; <xref ref-type="bibr" rid="ref26">Li et al., 2022</xref>). <xref ref-type="bibr" rid="ref52">Yao et al. (2021)</xref> first suggested using an extra plug-and-play temporal-wise attention module for SNNs to bypass a few unnecessary input timesteps. Then they proposed a multi-dimensional attention module along temporal-wise, channel-wise, and spatial-wise separately or simultaneously to optimize membrane potentials, which in turn regulate the spiking response (<xref ref-type="bibr" rid="ref55">Yao et al., 2023c</xref>). STSC-SNN (<xref ref-type="bibr" rid="ref56">Yu et al., 2022</xref>) employed temporal convolution and attention mechanisms to improve spatio-temporal receptive fields of synaptic connections. SCTFA-SNN (<xref ref-type="bibr" rid="ref6">Cai et al., 2023</xref>) computed channel-wise and spatial-wise attention separately to optimize membrane potentials along the temporal dimension. <xref ref-type="bibr" rid="ref53">Yao et al. (2023a</xref>,<xref ref-type="bibr" rid="ref54">b)</xref> recently proposed an advanced spatial attention module to harness SNNs&#x2019; redundancy, which can adaptively optimize their membrane potential distribution by a pair of individual spatial attention sub-modules. 
TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>) coordinated temporal-wise and channel-wise attention correlations using a 1-D convolution operation. However, the temporal-channel receptive field of TCJA is a local cross shape that is restricted by its convolution kernels, requiring multiple repeated computations to establish long-range dependencies of features. Therefore, it is computationally inefficient and makes multi-hop dependency modeling difficult.</p>
<p>Among the attention mechanisms, self-attention, as another important feature of the human biological system, possesses the ability to capture feature dependencies. Originally developed for natural language processing (<xref ref-type="bibr" rid="ref44">Vaswani et al., 2017</xref>), self-attention has been extended to computer vision, where it has achieved significant success in various applications. The self-attention module can also be considered a building block of CNN architectures, which are known for their limited scalability when it comes to large receptive fields (<xref ref-type="bibr" rid="ref17">Han et al., 2022</xref>). In contrast to the progressive behavior of convolution operation, self-attention can capture long-range dependencies directly by computing interactions between any two positions, regardless of their positional distance. Moreover, it is commonly integrated into the top of the networks to enhance high-level semantic features for vision tasks. Recently, an emerging research direction is to explore the biological characteristics associated with the fusion of self-attention and SNNs (<xref ref-type="bibr" rid="ref53">Yao et al., 2023a</xref>,<xref ref-type="bibr" rid="ref54">b</xref>; <xref ref-type="bibr" rid="ref60">Zhou C. et al., 2023</xref>; <xref ref-type="bibr" rid="ref61">Zhou Z. et al., 2023</xref>). These efforts primarily revolve around optimizing the computation of self-attention within SNNs by circumventing multiplicative operations, leading to performance degradation. Diverging from these studies, our primary goal is to explore how self-attention can enhance the spatio-temporal information processing capabilities of SNNs.</p>
</sec>
<sec id="sec4">
<label>2.2.</label>
<title>Learning algorithms for SNNs</title>
<p>Existing SNN training methods can be roughly divided into three categories: 1) the biologically plausible method, 2) the conversion method, and 3) the gradient-based direct training method. The first one is based on biologically plausible local learning rules, like spike-timing-dependent plasticity (STDP) (<xref ref-type="bibr" rid="ref9">Diehl and Cook, 2015</xref>; <xref ref-type="bibr" rid="ref22">Kheradpisheh et al., 2018</xref>) and ReSuMe (<xref ref-type="bibr" rid="ref33">Ponulak and Kasinski, 2010</xref>), but achieving high performance for deep networks is challenging. The conversion method offers an alternative way to obtain high-performance SNNs by converting a well-trained ANN and mapping its parameters to an SNN with an equivalent architecture, where the firing rate of the SNN acts as ReLU activation (<xref ref-type="bibr" rid="ref7">Cao et al., 2015</xref>; <xref ref-type="bibr" rid="ref41">Rueckauer et al., 2017</xref>; <xref ref-type="bibr" rid="ref43">Sengupta et al., 2019</xref>; <xref ref-type="bibr" rid="ref10">Ding et al., 2021</xref>; <xref ref-type="bibr" rid="ref4">Bu et al., 2022</xref>; <xref ref-type="bibr" rid="ref49">Wu et al., 2023</xref>). Moreover, some works explored post-conversion fine-tuning of converted SNNs to reduce latency and increase accuracy (<xref ref-type="bibr" rid="ref36">Rathi et al., 2020</xref>; <xref ref-type="bibr" rid="ref35">Rathi and Roy, 2021</xref>; <xref ref-type="bibr" rid="ref48">Wu et al., 2021</xref>). However, this method is not suitable for neuromorphic datasets. The gradient-based direct training methods primarily include voltage gradient-based (<xref ref-type="bibr" rid="ref58">Zhang et al., 2020</xref>), timing gradient-based (<xref ref-type="bibr" rid="ref59">Zhang et al., 2021</xref>), and activation gradient-based approaches. Among them, the activation gradient-based method demonstrates notable effectiveness when performing challenging tasks. 
This approach uses surrogate gradients to address the non-differentiable spike activity issue, allowing for error back-propagation through time (BPTT) to interface with gradient descent directly on SNNs for end-to-end training (<xref ref-type="bibr" rid="ref31">Neftci et al., 2019</xref>; <xref ref-type="bibr" rid="ref47">Wu et al., 2019</xref>; <xref ref-type="bibr" rid="ref51">Yang et al., 2021</xref>; <xref ref-type="bibr" rid="ref57">Zenke and Vogels, 2021</xref>). These efforts have shown strong potential in achieving high performance by exploiting spatio-temporal information. However, further research is required to determine how to make better use of spatio-temporal data and how to efficiently extract spatio-temporal features. This is what we want to contribute.</p>
</sec>
</sec>
<sec sec-type="materials|methods" id="sec5">
<label>3.</label>
<title>Materials and methods</title>
<p>In this section, we first present the representation of event streams and the adopted spiking neuron model and later propose our STCA module based on this neuron model. Finally, we introduce the training method adopted in this paper.</p>
<sec id="sec6">
<label>3.1.</label>
<title>Representation of event streams</title>
<p>An event, <italic>e</italic>, encodes three pieces of information: the pixel location (<italic>x</italic>, <italic>y</italic>) of the event, the timestamp <italic>t</italic>&#x2032; recording the time when the event is triggered, and the polarity of each single event <italic>p</italic> &#x2208; {&#x2212;1, +1} reflecting an increase or decrease of brightness via +1/&#x2212;1. Formally, a set of events at the timestamp <italic>t</italic>&#x2032; can be defined as:</p>
<disp-formula id="EQ1">
<label>(1)</label>
<mml:math id="M1">
<mml:msub>
<mml:mi>E</mml:mi>
<mml:msup>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msubsup>
<mml:mfenced open="{" close="}">
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mspace width="0.5em"/>
<mml:msup>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mfenced>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
<mml:mi>N</mml:mi>
</mml:msubsup>
</mml:math>
</disp-formula>
<p>Assuming the spatial resolution is <italic>h</italic>&#x2009;&#x00D7;&#x2009;<italic>w</italic>, the event set equals the spike pattern tensor <italic>S<sub>t&#x2032;</sub></italic>&#x2208;R<sup>2&#x2009;&#x00D7;&#x2009;h&#x2009;&#x00D7;&#x2009;w</sup> at the timestamp <italic>t</italic>&#x2032;. However, processing these events one by one can be inefficient due to the limited amount of information contained in a single event. We follow the frame-based representation in SpikingJelly (<xref ref-type="bibr" rid="ref12">Fang et al., 2020</xref>) that transforms event streams into high-rate frame sequences during preprocessing. Each frame includes many blank (zero) areas, and SNNs can skip the computation of the zero areas in each input frame (<xref ref-type="bibr" rid="ref40">Roy et al., 2019</xref>), improving overall efficiency.</p>
</sec>
<sec id="sec7">
<label>3.2.</label>
<title>Spiking neural models</title>
<p>Spiking neuron in SNNs integrates synaptic inputs from the previous layer and the residual membrane potential into the latest membrane potential. The Parametric Leaky integrate-and-fire (PLIF) model can learn the synaptic weight and membrane time constant simultaneously, which can enhance the learning capabilities of SNNs (<xref ref-type="bibr" rid="ref13">Fang et al., 2021</xref>). The subthreshold dynamics of the PLIF neuron is defined as:</p>
<disp-formula id="EQ2">
<label>(2)</label>
<mml:math id="M2">
<mml:mi>&#x03C4;</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>t</mml:mi>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>t</mml:mi>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi mathvariant="italic">rest</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:mi>X</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>t</mml:mi>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where <italic>V</italic> (<italic>t</italic>) indicates the membrane potential of the neuron at time <italic>t</italic>, <italic>&#x03C4;</italic> is the membrane time constant that controls the decay of <italic>V</italic> (<italic>t</italic>), <italic>X</italic> (<italic>t</italic>) is the input collected from the presynaptic neurons and <italic>V<sub>rest</sub></italic> is the resting potential. When the membrane potential <italic>V</italic> (<italic>t</italic>) exceeds the neuron threshold at time <italic>t</italic>, the neuron will emit a spike, and then the membrane potential goes back to a reset value <italic>V<sub>rest</sub></italic>. We set <italic>V<sub>rest</sub></italic>&#x2009;=&#x2009;<italic>V<sub>reset</sub></italic>&#x2009;=&#x2009;0. The iterative representation of the PLIF model can be described as follows:</p>
<disp-formula id="EQ3">
<label>(3)</label>
<mml:math id="M3">
<mml:mo stretchy="true">{</mml:mo>
<mml:mtable columnalign="center">
<mml:mtr columnalign="center">
<mml:mtd columnalign="center">
<mml:msup>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:mfrac>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mi>&#x03C4;</mml:mi>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi mathvariant="italic">reset</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="center">
<mml:mtd columnalign="center">
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi>&#x0398;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr columnalign="center">
<mml:mtd columnalign="center">
<mml:msup>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:msup>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mi mathvariant="normal">e</mml:mi>
<mml:mi mathvariant="italic">set</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:math>
</disp-formula>
<p>where superscripts <italic>t</italic> and <italic>l</italic> indicate the time step and layer index. To avoid confusion, we use <italic>H<sup>t,l</sup></italic> and <italic>V<sup>t,l</sup></italic> to represent the membrane potential after neuronal dynamics and after the trigger of a spike in layer <italic>l</italic> at time-step <italic>t</italic>, respectively<italic>. V<sub>th</sub></italic> is the firing threshold. <italic>S<sup>t,l</sup></italic> is determined by <inline-formula>
<mml:math id="M4">
<mml:mi>&#x0398;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
</mml:math>
</inline-formula>, the Heaviside step function that outputs 1 if <italic>x</italic>&#x2009;&#x2265;&#x2009;0 or 0 otherwise. The time constant <italic>&#x03C4;&#x2009;=</italic> 1<italic>/k</italic>(<italic>a</italic>), where <italic>k</italic>(<italic>a</italic>) is a sigmoid function 1/(1&#x2009;+&#x2009;exp(&#x2212;<italic>a</italic>)) with a trainable parameter <italic>a</italic>.</p>
</sec>
<sec id="sec8">
<label>3.3.</label>
<title>Self-attention-based temporal-channel joint attention module</title>
<p>The processing of temporal information in SNNs is generally attributed to spiking neurons because their dynamics naturally depend on the temporal dimension. However, the LIF neuron and its variants, including the PLIF neuron, only sustain very weak temporal linkages. Additionally, event streams are inherently time-dependent; therefore, it is necessary to establish spatial&#x2013;temporal correlations to improve data utilization. The focus of this work is to model temporal-wise and channel-wise attention correlations globally by adopting a self-attention mechanism. We present our idea of attention with a pluggable module termed the Self-attention-based Temporal-Channel joint Attention (STCA), which is depicted in <xref ref-type="fig" rid="fig2">Figure 2</xref>.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Diagram of the STCA module. The STCA module first aggregates spatial information by average-pooling and max-pooling then merges them and feeds it into a self-attention block to establish the correlations in both temporal and channel dimensions.</p>
</caption>
<graphic xlink:href="fnins-17-1261543-g002.tif"/>
</fig>
<p>Formally, we collect the intermediate spatial features of the <italic>l</italic>-th layer at all time-steps <italic>X<sup>l</sup></italic>&#x2009;=&#x2009;[&#x00B7; &#x00B7; &#x00B7;, <italic>X<sup>t,l</sup></italic>, &#x00B7; &#x00B7; &#x00B7;]&#x2208; R<sup>T&#x2009;&#x00D7;&#x2009;C&#x2009;&#x00D7;&#x2009;H&#x2009;&#x00D7;&#x2009;W</sup> as the input of STCA module, where <italic>T</italic> is time-step, <italic>C</italic> denotes channels, <italic>H</italic> and <italic>W</italic> are height and width of the feature, respectively. The spatial feature <italic>X<sup>t,l</sup></italic> can be extracted from the original input <italic>S<sup>t,l</sup></italic>:</p>
<disp-formula id="EQ4">
<label>(4)</label>
<mml:math id="M5">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold">B</mml:mi>
<mml:mi mathvariant="bold">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold">Conv</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msup>
<mml:mi mathvariant="bold">W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi mathvariant="bold">S</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where BN (&#x00B7;) and Conv (&#x00B7;) mean the batch normalization and convolutional operation, <italic>W<sup>l</sup></italic> is the weight matrix, <italic>S</italic><sup><italic>t</italic>, <italic>l</italic><italic>-1</italic></sup> (<italic>l</italic>&#x2009;&#x2260;&#x2009;1) is a spike tensor that only contains 0 and 1, and <inline-formula>
<mml:math id="M6">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>. To simplify the notation, bias terms are omitted. As BN is a default operation following the Conv, we also omit it in the rest of this paper. Since each spatial feature <italic>X<sup>t,l</sup></italic> in <italic>X<sup>l</sup></italic> is time-dependent, our idea of attention is to utilize the temporal correlation of these features. It is well known that each channel of feature maps corresponds to a specific visual pattern. Our STCA module aims to determine &#x2018;when&#x2019; to attend to &#x2018;what&#x2019; are semantic attributes of the given input. For efficiency, STCA only focuses on temporal and channel modeling, the spatial information of the feature is aggregated by using both avg-pooling and max-pooling operations as follows:</p>
<disp-formula id="EQ5">
<label>(5)</label>
<mml:math id="M7">
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold">AvgPool</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mfenced>
<mml:mo>+</mml:mo>
<mml:mi mathvariant="bold">MaxPool</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where AvgPool (&#x00B7;) and MaxPool (&#x00B7;) represent the outputs of the avg-pooling and max-pooling layers, respectively, and <italic>R<sup>l</sup></italic>&#x2208;R<sup>T &#x00D7; C</sup>. The two generated temporal-channel context descriptors, avg-pooled features and max-pooled features, are merged and then fed into a self-attention (SA) block. We follow the convention (<xref ref-type="bibr" rid="ref45">Wang et al., 2018</xref>) to formulate the SA block, where the input feature in layer <italic>l</italic> is <italic>R<sup>l</sup></italic>&#x2208;R<sup>T &#x00D7; C</sup>, and the output feature is generated as:</p>
<disp-formula id="EQ6">
<label>(6)</label>
<mml:math id="M8">
<mml:msubsup>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mi>g</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where <italic>r<sub>i</sub></italic>&#x2208;R<sup>1&#x2009;&#x00D7;&#x2009;C</sup> and <italic>a<sub>i</sub></italic>&#x2208;R<sup>1&#x2009;&#x00D7;&#x2009;C</sup> indicate the <italic>i<sup>th</sup></italic> position of the input feature <italic>R<sup>l</sup></italic> and output feature <italic>A<sup>l</sup></italic>, respectively. Subscript <italic>j</italic> is the index that enumerates all positions along the temporal domain, i.e., <italic>i</italic>, <italic>j</italic>&#x2208;[1,2,&#x2026;, T], and a pairwise function <italic>f</italic> (&#x00B7;) computes a scalar representing the relationship between <italic>i</italic> and all <italic>j.</italic> The function <italic>g</italic> (&#x00B7;) computes a representation of the input signal at time-step <italic>j</italic>, and the response is normalized by a factor <italic>C</italic> (<italic>r<sub>i</sub></italic>). We use a simple extension of the Gaussian function to compute the similarity in an embedding space, and the function <italic>f</italic> (&#x00B7;) can be formulated as:</p>
<disp-formula id="EQ7">
<label>(7)</label>
<mml:math id="M9">
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>&#x03B8;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mi>&#x03D5;</mml:mi>
<mml:msup>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:math>
</disp-formula>
<p>where <italic>&#x03B8;</italic> (&#x00B7;) and <italic>&#x03D5;</italic> (&#x00B7;) can be any embedding layers. If we consider the <italic>&#x03B8;</italic> (&#x00B7;), <italic>&#x03D5;</italic> (&#x00B7;), <italic>g</italic> (&#x00B7;) in the form of linear embedding: <italic>&#x03B8;</italic> (<italic>R<sup>l</sup></italic>)&#x2009;=&#x2009;<italic>R<sup>l</sup>W<sub>&#x03B8;</sub></italic>, <italic>&#x03D5;</italic> (<italic>R<sup>l</sup></italic>)&#x2009;=&#x2009;<italic>R<sup>l</sup>W<sub>&#x03D5;</sub></italic>, <italic>g</italic> (<italic>R<sup>l</sup></italic>)&#x2009;=&#x2009;<italic>R<sup>l</sup>W<sub>g</sub></italic>, where <italic>W<sub>&#x03B8;</sub></italic>&#x2208;<inline-formula>
<mml:math id="M10">
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, <italic>W<sub>&#x03D5;</sub></italic> &#x2208; <inline-formula>
<mml:math id="M11">
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula><italic>, W<sub>g</sub></italic> &#x2208; <inline-formula>
<mml:math id="M12">
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, and set the normalization factor as <inline-formula>
<mml:math id="M13">
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:munder>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula>, the <xref ref-type="disp-formula" rid="EQ6">Eq. 6</xref> can be rewritten as:</p>
<disp-formula id="EQ8">
<label>(8)</label>
<mml:math id="M14">
<mml:msubsup>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>l</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>&#x03B8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>&#x03D5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:msub>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>&#x03B8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>&#x03D5;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:msubsup>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</disp-formula>
<p>where <bold><italic>w</italic></bold><italic><sub>&#x03B8;,i</sub></italic>&#x2208;R<sup>C&#x2009;&#x00D7;&#x2009;1</sup> is the <italic>i<sup>th</sup></italic> row of the weight matrix <italic>W<sub>&#x03B8;</sub></italic>. For a given index <italic>i</italic>, <inline-formula>
<mml:math id="M15">
<mml:mfrac>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")" separators=",">
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mfenced>
</mml:math>
</inline-formula> becomes the softmax output along the dimension <italic>j.</italic> The formulation can be further rewritten as:</p>
<disp-formula id="EQ9">
<label>(9)</label>
<mml:math id="M16">
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold">softmax</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>&#x03B8;</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>W</mml:mi>
<mml:mi>&#x03D5;</mml:mi>
<mml:mi>T</mml:mi>
</mml:msubsup>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mi>g</mml:mi>
<mml:mfenced open="(" close=")">
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:mfenced>
</mml:math>
</disp-formula>
<p>where <italic>A<sup>l</sup></italic>&#x2208;R<sup>T&#x2009;&#x00D7;&#x2009;C</sup> is the output feature of the same size as <italic>R<sup>l</sup></italic>. Given the query, key, and value representations:</p>
<disp-formula id="EQ10">
<label>(10)</label>
<mml:math id="M17">
<mml:mi>Q</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>K</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msup>
</mml:math>
</disp-formula>
<p>Once <inline-formula>
<mml:math id="M18">
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>&#x03B8;</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M19">
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>&#x03D5;</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula>
<mml:math id="M20">
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>g</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>, <italic>W<sup>Q</sup></italic>&#x2208;R<sup>C&#x2009;&#x00D7;&#x2009;C</sup>, <italic>W<sup>K</sup></italic>&#x2208;R<sup>C&#x2009;&#x00D7;&#x2009;C</sup>, and <italic>W<sup>V</sup></italic>&#x2208;R<sup>C&#x2009;&#x00D7;&#x2009;C</sup>, <xref ref-type="disp-formula" rid="EQ9">Eq. 9</xref> can be formulated as:</p>
<disp-formula id="EQ11">
<label>(11)</label>
<mml:math id="M21">
<mml:msup>
<mml:mi>A</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="bold">softmax</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:msup>
<mml:mi>K</mml:mi>
<mml:mi>T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mi>V</mml:mi>
</mml:math>
</disp-formula>
<p>In this way, the SA block is constructed. Then we employ a residual connection around the SA block. Finally, the attention process of STCA can be formulated as:</p>
<disp-formula id="EQ12">
<label>(12)</label>
<mml:math id="M22">
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mi mathvariant="italic">STCA</mml:mi>
<mml:mi>l</mml:mi>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:mi>f</mml:mi>
<mml:mo>&#x2299;</mml:mo>
<mml:msup>
<mml:mi>X</mml:mi>
<mml:mi>l</mml:mi>
</mml:msup>
</mml:math>
</disp-formula>
<p>where <italic>f</italic> = <italic>&#x03C3;</italic>(<italic>R<sup>l</sup> +&#x2009;A<sup>l</sup></italic>) &#x2208; <italic>R</italic><sup>T&#x2009;&#x00D7;&#x2009;C</sup> is the weight vector of STCA, &#x2299; is element-wise multiplication, <italic>&#x03C3;</italic> is the sigmoid function, and <italic>X<sup>l</sup><sub>STCA</sub></italic>&#x2208;R<sup>T&#x2009;&#x00D7;&#x2009;C&#x2009;&#x00D7;&#x2009;H&#x2009;&#x00D7;&#x2009;W</sup> denotes the feature extracted by the STCA module along temporal and channel dimensions.</p>
</sec>
<sec id="sec9">
<label>3.4.</label>
<title>Training</title>
<p>We integrate the STCA module into networks and utilize the BPTT method to train SNNs. Since the process of neuron firing is non-differentiable, we use the derived ATan surrogate function <inline-formula>
<mml:math id="M23">
<mml:msup>
<mml:mi>&#x03C3;</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mfenced open="(" close=")">
<mml:mi>x</mml:mi>
</mml:mfenced>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>&#x03B1;</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>+</mml:mo>
<mml:msup>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>&#x03C0;</mml:mi>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">/</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mtext>.</mml:mtext>
</mml:math>
</inline-formula>&#x2009;For a given input with label <italic>n</italic>, the neuron that represents class <italic>n</italic> has the highest excitatory level while other neurons remain silent. So the target output is defined by <italic>Y</italic>&#x2009;=&#x2009;[<italic>y<sup>t, i</sup></italic>] with <italic>y<sup>t, i</sup></italic>&#x2009;=&#x2009;1 for <italic>i</italic>&#x2009;=&#x2009;<italic>n</italic>, and <italic>y<sup>t, i</sup></italic>&#x2009;=&#x2009;0 for <italic>i</italic>&#x2009;&#x2260;&#x2009;<italic>n</italic>. Then the loss function is described by the spike mean squared error:</p>
<disp-formula id="EQ13">
<label>(13)</label>
<mml:math id="M24">
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:msup>
<mml:mfenced open="&#x2016;" close="&#x2016;">
<mml:mrow>
<mml:msup>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mi>T</mml:mi>
</mml:mfrac>
<mml:munderover>
<mml:mstyle displaystyle="true">
<mml:mo stretchy="true">&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:munderover>
<mml:msup>
<mml:mi>o</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:math>
</disp-formula>
<p>where <italic>O&#x2009;=</italic> [<italic>o<sup>t, i</sup></italic>] denotes the average spiking events of the neurons under the voting strategy.</p>
</sec>
</sec>
<sec id="sec10">
<label>4.</label>
<title>Experiments</title>
<sec id="sec11">
<label>4.1.</label>
<title>Experimental setup</title>
<sec id="sec12">
<label>4.1.1.</label>
<title>Implementation details</title>
<p>We implement our experiments with the Pytorch package and SpikingJelly framework. All experiments were conducted using the BPTT learning algorithm on 4 NVIDIA RTX 2080 Ti GPUs. We utilized the Adam optimizer (<xref ref-type="bibr" rid="ref24">Kingma and Ba, 2015</xref>) to accelerate the training process and implemented some standard training techniques of deep learning such as batch normalization and dropout. The corresponding hyper-parameters and SNN hyper-parameters are shown in <xref ref-type="table" rid="tab1">Table 1</xref>. We verify our method on the following DVS benchmarks:</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Hyper-parameter setting.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Hyperparameter</th>
<th align="center" valign="top">N-MNIST</th>
<th align="center" valign="top">CIFAR10-DVS</th>
<th align="center" valign="top">N-Caltech 101</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Max Epoch</td>
<td align="center" valign="middle">500</td>
<td align="center" valign="top">1,000</td>
<td align="center" valign="middle">500</td>
</tr>
<tr>
<td align="left" valign="middle">Automatic mixed precision</td>
<td align="center" valign="middle">x</td>
<td align="center" valign="middle">x</td>
<td align="center" valign="middle">&#x2713;</td>
</tr>
<tr>
<td align="left" valign="middle">Batch size</td>
<td align="center" valign="middle">64</td>
<td align="center" valign="top">32</td>
<td align="center" valign="middle">8</td>
</tr>
<tr>
<td align="left" valign="middle">Learning rate</td>
<td align="center" valign="middle">1e-3</td>
<td align="center" valign="top">1e-3</td>
<td align="center" valign="middle">1e-3</td>
</tr>
<tr>
<td align="left" valign="middle">Time step</td>
<td align="center" valign="middle">10</td>
<td align="center" valign="top">10</td>
<td align="center" valign="middle">14</td>
</tr>
<tr>
<td align="left" valign="middle"><italic>V<sub>th</sub></italic>
</td>
<td align="center" valign="middle">1.0</td>
<td align="center" valign="top">1.0</td>
<td align="center" valign="middle">1.0</td>
</tr>
<tr>
<td align="left" valign="middle"><italic>&#x03C4;<sub>0</sub></italic>
</td>
<td align="center" valign="middle">2.0</td>
<td align="center" valign="top">2.0</td>
<td align="center" valign="middle">2.0</td>
</tr>
<tr>
<td align="left" valign="middle">head</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">4</td>
<td align="center" valign="middle">4</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>CIFAR10-DVS contains 10&#x2009;K DVS images of 10 classes recorded with the dynamic vision sensor from the original static CIFAR10 dataset. We apply a 9: 1 train-valid split (i.e., 9&#x2009;k training images and 1&#x2009;k validation images). The resolution is 128&#x2009;&#x00D7;&#x2009;128, we resize all of them to 48&#x2009;&#x00D7;&#x2009;48 in our training and we integrate the event data into 10 frames per sample (<xref ref-type="bibr" rid="ref27">Li et al., 2017</xref>).</p>
<p>N-Caltech 101 dataset contains 8,831 DVS images converted from the original version of Caltech 101 with a slight change in object classes to avoid confusion. The N-Caltech 101 consists of 100 object classes plus one background class. Similarly, we apply the 9: 1 train-test split as CIFAR10-DVS. We use the SpikingJelly (<xref ref-type="bibr" rid="ref12">Fang et al., 2020</xref>) package to process the data and integrate them into 14 frames per sample (<xref ref-type="bibr" rid="ref32">Orchard et al., 2015</xref>).</p>
<p>The neuromorphic MNIST dataset is a converted dataset from the original static MNIST dataset (<xref ref-type="bibr" rid="ref32">Orchard et al., 2015</xref>). It contains 50&#x2009;K training images and 10&#x2009;K validation images. We integrate the event data into 10 frames per sample using SpikingJelly (<xref ref-type="bibr" rid="ref12">Fang et al., 2020</xref>) package.</p>
</sec>
<sec id="sec16">
<label>4.1.2.</label>
<title>Networks</title>
<p>The network structures with STCA for different datasets are provided in <xref ref-type="table" rid="tab2">Table 2</xref> and the network architectures we use have been proven to perform quite well on each dataset. Specifically, for the CIFAR10-DVS dataset, we adopt a VGG11-like architecture. To mitigate the apparent overfitting on the CIFAR10-DVS dataset, we adopt the neuromorphic data augmentation, including horizontal Flipping and Mixup in each frame, which is also used in <xref ref-type="bibr" rid="ref62">Zhu et al. (2022)</xref> for training the same dataset. For the N-Caltech 101 dataset, we adopt the same architecture with <xref ref-type="bibr" rid="ref62">Zhu et al. (2022)</xref> and N-MNIST refers to PLIF <xref ref-type="bibr" rid="ref13">Fang et al. (2021)</xref>. The voting layers are implemented using average pooling for classification robustness.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>The network structures with STCA for different datasets.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Dataset</th>
<th align="left" valign="top">Network structure</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">N-MNIST</td>
<td align="left" valign="middle">Input-128C3-Neuron-MP2-128C3-Neuron-STCA-MP2-0.5DP-2048FC-Neuron-0.5DP-100FC-Neuron-Voting</td>
</tr>
<tr>
<td align="left" valign="middle">CIFAR10-DVS</td>
<td align="left" valign="middle">Input-64C3-Neuron-128C3-Neuron-AP2-256C3-Neuron-256C3-Neuron-STCA-AP2-512C3-Neuron -512C3-Neuron-STCA-AP2-512C3-Neuron-512C3-Neuron-AP2-10FC-Neuron</td>
</tr>
<tr>
<td align="left" valign="middle">N-Caltech 101</td>
<td align="left" valign="middle">64C3-Neuron-MP2-128C3-Neuron-MP2-256C3-Neuron-STCA-MP2-256C3-Neuron-STCA-MP2-512C3-Neuron-0.8DP-1024FC-Neuron-0.5DP-101FC-Neuron</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic>x</italic>C<italic>y</italic>/MP<italic>y</italic>/AP<italic>y</italic> denotes the Conv2D/MaxPooling/Avgpooling layer with output channel&#x2009;=&#x2009;<italic>x</italic>, and kernel size&#x2009;=&#x2009;<italic>y</italic>&#x2009;&#x00D7;&#x2009;<italic>y</italic>&#x2009;, &#x2009;<italic>n</italic>&#x2009;FC denotes the fully connected layer with output feature&#x2009;= <italic>n</italic>, <italic>m</italic>DP is the spiking dropout layer with dropout ratio <italic>m</italic>. BN follows behind all <italic>x</italic>C<italic>y</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="sec17">
<label>4.2.</label>
<title>Comparison with existing state-of-the-art works</title>
<p><xref ref-type="table" rid="tab3">Table 3</xref> displays the accuracy performance of the proposed STCA-SNNs compared to other competing methods on three neuromorphic datasets, N-MNIST, CIFAR10-DVS, and N-Caltech 101. We mainly include direct training results of SNNs with signal transmission via binary spike. Among them, some works (<xref ref-type="bibr" rid="ref47">Wu et al., 2019</xref>; <xref ref-type="bibr" rid="ref52">Yao et al., 2021</xref>) replace binary spikes with floating-point spikes and maintain the same forward pipeline as SNNs to obtain enhanced classification accuracy. STCA-SNNs achieve better performance than existing state-of-the-art SNNs on all datasets. We first compare our method on the CIFAR10-DVS dataset. We continue to utilize MSE as the loss function and the same network architecture as TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>) and STSC-SNN (<xref ref-type="bibr" rid="ref56">Yu et al., 2022</xref>) to preserve the consistency of this work, and our method reaches 81.6% top-1 accuracy, improving the accuracy by 0.9% over TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>). We also compare our method on the N-Caltech 101 dataset. Under the same condition as TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>) with MSE as the loss function, we obtain a 2.38% increase over it and outperform the comparable results. Finally, we test our algorithm on the N-MNIST dataset. As shown in <xref ref-type="table" rid="tab3">Table 3</xref>, most comparison works get over 99% accuracy. We use the same architecture as PLIF. Our STCA-SNN reaches the best accuracy of 99.67%.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Accuracy performance comparison between the proposed method and the SOTA methods on different datasets.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">Method</th>
<th align="center" valign="top" rowspan="2">Binary spikes</th>
<th align="center" valign="top" colspan="2">N-MNIST</th>
<th align="center" valign="top" colspan="2">CIFAR10-DVS</th>
<th align="center" valign="top" colspan="2">N-Caltech 101</th>
</tr>
<tr>
<th align="center" valign="top">T</th>
<th align="center" valign="top">Acc. (%)</th>
<th align="center" valign="top">T</th>
<th align="center" valign="top">Acc. (%)</th>
<th align="center" valign="top">T</th>
<th align="center" valign="top">Acc. (%)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">tdBN (<xref ref-type="bibr" rid="ref51">Yang et al., 2021</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="char" valign="middle" char=".">&#x2013;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">67.8</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">Rollout (<xref ref-type="bibr" rid="ref25">Kugele et al., 2020</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">32</td>
<td align="char" valign="middle" char=".">99.57</td>
<td align="center" valign="middle">48</td>
<td align="char" valign="middle" char=".">66.97</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">LIAF-Net (<xref ref-type="bibr" rid="ref47">Wu et al., 2019</xref>)</td>
<td align="center" valign="middle">x</td>
<td align="center" valign="middle">20</td>
<td align="char" valign="middle" char=".">99.13</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">70.4</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">ConvSNN (<xref ref-type="bibr" rid="ref42">Samadzadeh et al., 2023</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">-</td>
<td align="char" valign="middle" char=".">99.6</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="char" valign="middle" char=".">69.2</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">PLIF (<xref ref-type="bibr" rid="ref13">Fang et al., 2021</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">99.61</td>
<td align="center" valign="middle">20</td>
<td align="char" valign="middle" char=".">74.80</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">TA-SNN (<xref ref-type="bibr" rid="ref52">Yao et al., 2021</xref>)</td>
<td align="center" valign="middle">&#x2717;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="char" valign="top" char=".">&#x2013;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">72.0</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">SALT (<xref ref-type="bibr" rid="ref23">Kim and Panda, 2021</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="char" valign="top" char=".">&#x2013;</td>
<td align="center" valign="middle">20</td>
<td align="char" valign="middle" char=".">67.1</td>
<td align="center" valign="middle">20</td>
<td align="center" valign="middle">55.0</td>
</tr>
<tr>
<td align="left" valign="middle">STSC-SNN (<xref ref-type="bibr" rid="ref56">Yu et al., 2022</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">99.64</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">81.4<xref ref-type="table-fn" rid="tfn1"><sup>a</sup></xref></td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">TCJA-SNN (<xref ref-type="bibr" rid="ref62">Zhu et al., 2022</xref>)</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="char" valign="top" char=".">&#x2013;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">80.7<xref ref-type="table-fn" rid="tfn1"><sup>a</sup></xref></td>
<td align="center" valign="middle">14</td>
<td align="center" valign="middle">78.5</td>
</tr>
<tr>
<td align="left" valign="middle">This work</td>
<td align="center" valign="middle">&#x2713;</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">99.67</td>
<td align="center" valign="middle">10</td>
<td align="char" valign="middle" char=".">81.6<xref ref-type="table-fn" rid="tfn1"><sup>a</sup></xref></td>
<td align="center" valign="middle">14</td>
<td align="center" valign="middle">80.88</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="tfn1">
<label>a</label>
<p>With data augmentation.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="sec18">
<label>4.3.</label>
<title>Ablation study</title>
<sec id="sec19">
<label>4.3.1.</label>
<title>Ablation study</title>
<p>We performed ablation experiments based on the PLIF neuron model to evaluate the effectiveness of the STCA module. For each dataset, we trained three types of SNNs: STCA-SNNs, TA-SNNs with temporal-wise attention module (<xref ref-type="bibr" rid="ref55">Yao et al., 2023c</xref>), and vanilla SNNs (PLIF-SNN) without any attention module. The SE attention employed by TA-SNNs in the temporal dimension and the Self-attention employed in this work are both non-local operators, thus, we compared the performance of these two classic non-local operators under the same experiment conditions. We followed the learning process described in section 4.1 for all ablation experiments, and the attention locations were identical for both TA-SNNs and STCA-SNNs. <xref ref-type="table" rid="tab4">Table 4</xref> shows that all STCA-SNNs outperformed vanilla SNNs on three event stream classification datasets, suggesting that the benefits of the STCA module are not limited to a specific dataset or architecture. Furthermore, <xref ref-type="fig" rid="fig3">Figure 3</xref> illustrates the accuracy performance trend of vanilla SNN, TA-SNN, and our proposed STCA-SNN over 1,000 epochs on the N-Caltech101 dataset. As the training epoch increased, our proposed STCA-SNN demonstrated comparable performance with TA-SNN. This indicates that our STCA module can enhance the representation ability of SNNs.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Accuracy of vanilla SNN, TA-SNN, and STCA-SNN models on different datasets.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">N-MNIST</th>
<th align="center" valign="top">CIFAR10-DVS</th>
<th align="center" valign="top">N-Caltech 101</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">Vanilla SNN</td>
<td align="char" valign="middle" char=".">99.64</td>
<td align="char" valign="top" char=".">80.7</td>
<td align="char" valign="middle" char=".">79.40</td>
</tr>
<tr>
<td align="left" valign="middle">TA-SNN</td>
<td align="char" valign="middle" char=".">99.64</td>
<td align="char" valign="top" char=".">81.3</td>
<td align="char" valign="middle" char=".">80.76</td>
</tr>
<tr>
<td align="left" valign="middle">STCA-SNN</td>
<td align="char" valign="middle" char=".">99.67</td>
<td align="char" valign="top" char=".">81.6</td>
<td align="char" valign="middle" char=".">80.88</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Convergence of compared SNN methods on N-Caltech101 dataset.</p>
</caption>
<graphic xlink:href="fnins-17-1261543-g003.tif"/>
</fig>
</sec>
<sec id="sec20">
<label>4.3.2.</label>
<title>Discussion of pooling operations</title>
<p>To investigate the influence of the avg-pooling and max-pooling operation, we conducted several ablation studies. As is well known, avg-pooling can capture the degree information of target objects, while max-pooling can extract discriminative features of objects. As shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>, the max-pooling operation contributes significantly to performance enhancement. Each experiment is run 3 times. Notably, the fusion of both pooling operations exhibits improved performance across all datasets examined, which means avg-pooling encoded global information can effectively compensate for the discriminative information encoded by max-pooling.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Accuracy of different datasets obtained by avg-pooling, max-pooling, and a combination of both. Each experiment is run 3 times.</p>
</caption>
<graphic xlink:href="fnins-17-1261543-g004.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusions" id="sec21">
<label>5.</label>
<title>Conclusion</title>
<p>In this work, we propose the STCA-SNNs to enhance the temporal information processing capabilities of SNNs. The STCA module captures temporal dependencies across channels globally using self-attention, enabling the network to learn &#x2018;when&#x2019; to attend to &#x2018;what&#x2019;. We verified the performance of STCA-SNNs on various neuromorphic datasets across different architectures. The experimental results show that STCA-SNNs achieve competitive accuracy on N-MNIST, CIFAR10-DVS, and N-Caltech 101 datasets.</p>
</sec>
<sec sec-type="data-availability" id="sec22">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="sec23">
<title>Author contributions</title>
<p>XW: Conceptualization, Investigation, Methodology, Software, Visualization, Writing &#x2013; original draft. YS: Funding acquisition, Supervision, Writing &#x2013; review &#x0026; editing. YZ: Supervision, Writing &#x2013; review &#x0026; editing. YJ: Supervision, Writing &#x2013; review &#x0026; editing. YB: Formal analysis, Validation, Writing &#x2013; review &#x0026; editing. XL: Formal analysis, Software, Validation, Visualization, Writing &#x2013; review &#x0026; editing. XY: Writing &#x2013; review &#x0026; editing.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="sec24">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China General Program (No. 82272130) and the National Natural Science Foundation of China Key Program (No. U22A20103).</p>
</sec>
<sec sec-type="COI-statement" id="sec25">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="sec100" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Amir</surname> <given-names>A.</given-names></name> <name><surname>Taba</surname> <given-names>B.</given-names></name> <name><surname>Berg</surname> <given-names>D.</given-names></name> <name><surname>Melano</surname> <given-names>T.</given-names></name> <name><surname>McKinstry</surname> <given-names>J.</given-names></name> <name><surname>Di Nolfo</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>A low power, fully event-based gesture recognition system</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>.</citation></ref>
<ref id="ref2"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Ba</surname> <given-names>J.</given-names></name> <name><surname>Mnih</surname> <given-names>V.</given-names></name> <name><surname>Kavukcuoglu</surname> <given-names>K.</given-names></name></person-group> (<year>2014</year>). <conf-name>Multiple object recognition with visual attention. In ICLR</conf-name>.</citation></ref>
<ref id="ref3"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Bellec</surname> <given-names>G.</given-names></name> <name><surname>Salaj</surname> <given-names>D.</given-names></name> <name><surname>Subramoney</surname> <given-names>A.</given-names></name> <name><surname>Legenstein</surname> <given-names>R.</given-names></name> <name><surname>Maass</surname> <given-names>W.</given-names></name></person-group> (<year>2018</year>). <article-title>Long short-term memory and learning-to-learn in networks of spiking neurons</article-title>. <conf-name>32nd conference on neural information processing systems</conf-name>.</citation></ref>
<ref id="ref4"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Bu</surname> <given-names>T.</given-names></name> <name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name></person-group> (<year>2022</year>). <conf-name>Optimized potential initialization for low-latency spiking neural networks, the thirty-sixth AAAI conference on artificial intelligence (AAAI)</conf-name>.</citation></ref>
<ref id="ref6"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>W.</given-names></name> <name><surname>Sun</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>R.</given-names></name> <name><surname>Cui</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Xia</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). A spatial-channel-temporal-fused attention for spiking neural networks. IEEE transactions on Neural Networks and Learning Systems. arXiv:2209.10837.</citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Khosla</surname> <given-names>D.</given-names></name></person-group> (<year>2015</year>). <article-title>Spiking deep convolutional neural networks for energy-efficient object recognition</article-title>. <source>Int. J. Comput. Vis.</source> <volume>113</volume>, <fpage>54</fpage>&#x2013;<lpage>66</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11263-014-0788-3</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Cheng</surname> <given-names>W.</given-names></name> <name><surname>Luo</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Chen</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name></person-group> (<year>2019</year>). <article-title>Det: a high-resolution dvs dataset for lane extraction</article-title>. <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</conf-name>, pp. <fpage>1666</fpage>&#x2013;<lpage>1675</lpage>.</citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Diehl</surname> <given-names>P.</given-names></name> <name><surname>Cook</surname> <given-names>M.</given-names></name></person-group> (<year>2015</year>). <article-title>Unsupervised learning of digit recognition using spike-timing-dependent plasticity</article-title>. <source>Front. Comput. Neurosci.</source> <volume>9</volume>:<fpage>99</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fncom.2015.00099</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name></person-group> (<year>2021</year>). <article-title>Optimal ANN-SNN conversion for fast and accurate inference in deep spiking neural networks</article-title>. <conf-name>International joint conference on artificial intelligence</conf-name>.</citation></ref>
<ref id="ref11"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname> <given-names>A.</given-names></name> <name><surname>Beyer</surname> <given-names>L.</given-names></name> <name><surname>Kolesnikov</surname> <given-names>A.</given-names></name> <name><surname>Weissenborn</surname> <given-names>D.</given-names></name> <name><surname>Zhai</surname> <given-names>X.</given-names></name> <name><surname>Unterthiner</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2020</year>). &#x201C;<article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>&#x201D; in <source>International conference on learning representations (ICLR)</source></citation></ref>
<ref id="ref12"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Ding</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>D.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2020</year>). Spikingjelly. Available at: <ext-link xlink:href="https://github.com/fangwei123456/spikingjelly" ext-link-type="uri">https://github.com/fangwei123456/spikingjelly</ext-link>.</citation></ref>
<ref id="ref13"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Fang</surname> <given-names>W.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name> <name><surname>Huang</surname> <given-names>T.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>Incorporating learnable membrane time constant to enhance learning of spiking neural networks</article-title>. <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>2661</fpage>&#x2013;<lpage>2671</lpage>.</citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gallego</surname> <given-names>G.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name> <name><surname>Orchard</surname> <given-names>G.</given-names></name> <name><surname>Bartolozzi</surname> <given-names>C.</given-names></name> <name><surname>Taba</surname> <given-names>B.</given-names></name> <name><surname>Censi</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Event-based vision: a survey</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>:<fpage>1</fpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2020.3008413</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Gerstner</surname> <given-names>W.</given-names></name> <name><surname>Kistler</surname> <given-names>W. M.</given-names></name> <name><surname>Naud</surname> <given-names>R.</given-names></name> <name><surname>Paninski</surname> <given-names>L.</given-names></name></person-group> (<year>2014</year>). <source>Neuronal dynamics: From single neurons to networks and models of cognition</source>, <publisher-name>Cambridge University Press</publisher-name>, <publisher-loc>Cambridge, MA</publisher-loc>.</citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>M. H.</given-names></name> <name><surname>Xu</surname> <given-names>T. X.</given-names></name> <name><surname>Liu</surname> <given-names>J. J.</given-names></name> <name><surname>Liu</surname> <given-names>Z. N.</given-names></name> <name><surname>Jiang</surname> <given-names>P. T.</given-names></name> <name><surname>Mu</surname> <given-names>T. J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Attention mechanisms in computer vision: a survey</article-title>. <source>Comput. Visual Media</source> <volume>8</volume>, <fpage>331</fpage>&#x2013;<lpage>368</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s41095-022-0271-y</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>K.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name> <name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A survey on vision transformer</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>, <fpage>87</fpage>&#x2013;<lpage>110</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2022.3152247</pub-id>, PMID: <pub-id pub-id-type="pmid">35180075</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Shen</surname> <given-names>L.</given-names></name> <name><surname>Sun</surname> <given-names>G.</given-names></name></person-group> (<year>2018</year>). <article-title>Squeeze-and-excitation networks</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>.</citation></ref>
<ref id="ref19"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>Spiking deep residual networks</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>34</volume>, <fpage>5200</fpage>&#x2013;<lpage>5205</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3119238</pub-id>, PMID: <pub-id pub-id-type="pmid">34723807</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Pan</surname> <given-names>L.</given-names></name> <name><surname>Qing</surname> <given-names>Z.</given-names></name> <name><surname>Tang</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2022</year>). <source>TAda! Temporally-adaptive convolutions for video understanding. In ICLR</source>.</citation></ref>
<ref id="ref21"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Itti</surname> <given-names>L.</given-names></name> <name><surname>Koch</surname> <given-names>C.</given-names></name> <name><surname>Niebur</surname> <given-names>E.</given-names></name></person-group> (<year>1998</year>). <article-title>A model of saliency-based visual attention for rapid scene analysis</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>20</volume>, <fpage>1254</fpage>&#x2013;<lpage>1259</lpage>. doi: <pub-id pub-id-type="doi">10.1109/34.730558</pub-id></citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kheradpisheh</surname> <given-names>S.</given-names></name> <name><surname>Mohammad</surname> <given-names>G.</given-names></name> <name><surname>Thorpe</surname> <given-names>S. J.</given-names></name> <name><surname>Masquelier</surname> <given-names>T.</given-names></name></person-group> (<year>2018</year>). <article-title>STDP-based spiking deep convolutional neural networks for object recognition</article-title>. <source>Neural Netw.</source> <volume>99</volume>, <fpage>56</fpage>&#x2013;<lpage>67</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2017.12.005</pub-id>, PMID: <pub-id pub-id-type="pmid">29328958</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>Y.</given-names></name> <name><surname>Panda</surname> <given-names>P.</given-names></name></person-group> (<year>2021</year>). <article-title>Optimizing deeper spiking neural networks for dynamic vision sensing</article-title>. <source>Neural Netw.</source> <volume>144</volume>, <fpage>686</fpage>&#x2013;<lpage>698</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neunet.2021.09.022</pub-id>, PMID: <pub-id pub-id-type="pmid">34662827</pub-id></citation></ref>
<ref id="ref24"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J. L.</given-names></name></person-group> (<year>2015</year>). <article-title>Adam: a method for stochastic optimization</article-title>. <conf-name>ICLR 2015: International conference on learning representations</conf-name>.</citation></ref>
<ref id="ref25"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kugele</surname> <given-names>A.</given-names></name> <name><surname>Pfeil</surname> <given-names>T.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Chicca</surname> <given-names>E.</given-names></name></person-group> (<year>2020</year>). <article-title>Efficient processing of spatio-temporal data streams with spiking neural networks</article-title>. <source>Front. Neurosci.</source> <volume>14</volume>:<fpage>439</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2020.00439</pub-id>, PMID: <pub-id pub-id-type="pmid">32431592</pub-id></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Fang</surname> <given-names>Q.</given-names></name> <name><surname>Zha</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>X.</given-names></name> <name><surname>Zheng</surname> <given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>HAM: hybrid attention module in deep convolutional neural networks for image classification</article-title>. <source>Pattern Recogn.</source> <volume>129</volume>:<fpage>108785</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patcog.2022.108785</pub-id></citation></ref>
<ref id="ref27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Ji</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2017</year>). <article-title>Cifar10-dvs: an event-stream dataset for object classification</article-title>. <source>Front. Neurosci.</source> <volume>11</volume>:<fpage>309</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2017.00309</pub-id>, PMID: <pub-id pub-id-type="pmid">28611582</pub-id></citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lichtsteiner</surname> <given-names>P.</given-names></name> <name><surname>Posch</surname> <given-names>C.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name></person-group> (<year>2008</year>). <article-title>A 128&#x00D7; 128 120 db 15 &#x03BC;s latency asynchronous temporal contrast vision sensor</article-title>. <source>IEEE J. Solid State Circuits</source> <volume>43</volume>, <fpage>566</fpage>&#x2013;<lpage>576</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JSSC.2007.914337</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Swin transformer: hierarchical vision transformer using shifted windows</article-title>. <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>.</citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mainen</surname> <given-names>Z. F.</given-names></name> <name><surname>Sejnowski</surname> <given-names>T. J.</given-names></name></person-group> (<year>1995</year>). <article-title>Reliability of spike timing in neocortical neurons</article-title>. <source>Science</source> <volume>268</volume>, <fpage>1503</fpage>&#x2013;<lpage>1506</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.7770778</pub-id>, PMID: <pub-id pub-id-type="pmid">7770778</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Neftci</surname> <given-names>E. O.</given-names></name> <name><surname>Mostafa</surname> <given-names>H.</given-names></name> <name><surname>Zenke</surname> <given-names>F.</given-names></name></person-group> (<year>2019</year>). <article-title>Surrogate gradient learning in spiking neural networks: bringing the power of gradient-based optimization to spiking neural networks</article-title>. <source>IEEE Signal Process. Mag.</source> <volume>36</volume>, <fpage>51</fpage>&#x2013;<lpage>63</lpage>. doi: <pub-id pub-id-type="doi">10.1109/MSP.2019.2931595</pub-id></citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Orchard</surname> <given-names>G.</given-names></name> <name><surname>Jayawant</surname> <given-names>A.</given-names></name> <name><surname>Cohen</surname> <given-names>G. K.</given-names></name> <name><surname>Thakor</surname> <given-names>N.</given-names></name></person-group> (<year>2015</year>). <article-title>Converting static image datasets to spiking neuromorphic datasets using saccades</article-title>. <source>Front. Neurosci.</source> <volume>9</volume>:<fpage>437</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2015.00437</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ponulak</surname> <given-names>F.</given-names></name> <name><surname>Kasinski</surname> <given-names>A.</given-names></name></person-group> (<year>2010</year>). <article-title>Supervised learning in spiking neural networks with ReSuMe: sequence learning, classification, and spike shifting</article-title>. <source>Neural Comput.</source> <volume>22</volume>, <fpage>467</fpage>&#x2013;<lpage>510</lpage>. doi: <pub-id pub-id-type="doi">10.1162/neco.2009.11-08-901</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Posch</surname> <given-names>C.</given-names></name> <name><surname>Matolin</surname> <given-names>D.</given-names></name> <name><surname>Wohlgenannt</surname> <given-names>R.</given-names></name></person-group> (<year>2010</year>). <article-title>A qvga 143 db dynamic range frame-free pwm image sensor with lossless pixel-level video compression and time-domain cds</article-title>. <source>IEEE J. Solid State Circuits</source> <volume>46</volume>, <fpage>259</fpage>&#x2013;<lpage>275</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JSSC.2010.2085952</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rathi</surname> <given-names>N.</given-names></name> <name><surname>Roy</surname> <given-names>K.</given-names></name></person-group> (<year>2021</year>). <article-title>DIET-SNN: a low-latency spiking neural network with direct input encoding and leakage and threshold optimization</article-title>. <source>IEEE Trans. Neural Networks Learn. Syst.</source> <volume>34</volume>, <fpage>3174</fpage>&#x2013;<lpage>3182</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3111897</pub-id></citation></ref>
<ref id="ref36"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Rathi</surname> <given-names>N.</given-names></name> <name><surname>Srinivasan</surname> <given-names>G.</given-names></name> <name><surname>Panda</surname> <given-names>P.</given-names></name> <name><surname>Roy</surname> <given-names>K.</given-names></name></person-group> (<year>2020</year>). <article-title>Enabling deep spiking neural networks with hybrid conversion and spike timing dependent backpropagation</article-title>. <conf-name>International conference on learning representations</conf-name>.</citation></ref>
<ref id="ref37"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rebecq</surname> <given-names>H.</given-names></name> <name><surname>Ranftl</surname> <given-names>R.</given-names></name> <name><surname>Koltun</surname> <given-names>V.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name></person-group> (<year>2019</year>). <article-title>High speed and high dynamic range video with an event camera</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>43</volume>, <fpage>1964</fpage>&#x2013;<lpage>1980</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1906.07165</pub-id></citation></ref>
<ref id="ref38"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Ridwan</surname> <given-names>I.</given-names></name> <name><surname>Cheng</surname> <given-names>H.</given-names></name></person-group> (<year>2017</year>). <source>An event-based optical flow algorithm for dynamic vision sensors</source>. <publisher-name>University of Lethbridge</publisher-name>, <publisher-loc>Lethbridge</publisher-loc>.</citation></ref>
<ref id="ref39"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Rieke</surname> <given-names>F.</given-names></name> <name><surname>Warland</surname> <given-names>D.</given-names></name> <name><surname>Van Steveninck</surname> <given-names>R. D. R.</given-names></name> <name><surname>Bialek</surname> <given-names>W.</given-names></name></person-group> (<year>1999</year>). <source>Spikes: Exploring the neural code</source>. <publisher-name>MIT Press</publisher-name>, <publisher-loc>Cambridge, MA</publisher-loc>.</citation></ref>
<ref id="ref40"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Roy</surname> <given-names>K.</given-names></name> <name><surname>Jaiswal</surname> <given-names>A.</given-names></name> <name><surname>Panda</surname> <given-names>P.</given-names></name></person-group> (<year>2019</year>). <article-title>Towards spike-based machine intelligence with neuromorphic computing</article-title>. <source>Nature</source>. <volume>575</volume>, <fpage>607</fpage>&#x2013;<lpage>617</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-019-1677-2</pub-id>, PMID: <pub-id pub-id-type="pmid">31776490</pub-id></citation></ref>
<ref id="ref41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rueckauer</surname> <given-names>B.</given-names></name> <name><surname>Lungu</surname> <given-names>I.</given-names></name> <name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>Conversion of continuous-valued deep networks to efficient event-driven networks for image classification</article-title>. <source>Front. Neurosci.</source> <volume>11</volume>:<fpage>682</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2017.00682</pub-id>, PMID: <pub-id pub-id-type="pmid">29375284</pub-id></citation></ref>
<ref id="ref42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Samadzadeh</surname> <given-names>A.</given-names></name> <name><surname>Far</surname> <given-names>F. S. T.</given-names></name> <name><surname>Javadi</surname> <given-names>A.</given-names></name> <name><surname>Nickabadi</surname> <given-names>A.</given-names></name> <name><surname>Chehreghani</surname> <given-names>M. H.</given-names></name></person-group> (<year>2023</year>). <article-title>Convolutional spiking neural networks for spatio-temporal feature extraction</article-title>. <source>Neural Processing Letters</source>. <fpage>1</fpage>&#x2013;<lpage>7</lpage>.</citation></ref>
<ref id="ref43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sengupta</surname> <given-names>A.</given-names></name> <name><surname>Ye</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>R.</given-names></name> <name><surname>Liu</surname> <given-names>C.</given-names></name> <name><surname>Roy</surname> <given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>Going deeper in spiking neural networks: VGG and residual architectures</article-title>. <source>Front. Neurosci.</source> <volume>13</volume>:<fpage>95</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2019.00095</pub-id>, PMID: <pub-id pub-id-type="pmid">30899212</pub-id></citation></ref>
<ref id="ref44"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>Attention is all you need</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>30</volume>, <fpage>5998</fpage>&#x2013;<lpage>6008</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></citation></ref>
<ref id="ref45"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Girshick</surname> <given-names>R.</given-names></name> <name><surname>Gupta</surname> <given-names>A.</given-names></name> <name><surname>He</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <article-title>Non-local neural networks</article-title>. <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>.</citation></ref>
<ref id="ref46"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Woo</surname> <given-names>S.</given-names></name> <name><surname>Park</surname> <given-names>J.</given-names></name> <name><surname>Lee</surname> <given-names>J.</given-names></name> <name><surname>Kweon</surname> <given-names>I.</given-names></name></person-group> (<year>2018</year>). <article-title>Cbam: convolutional block attention module</article-title>. <conf-name>Proceedings of the European conference on computer vision (ECCV)</conf-name>.</citation></ref>
<ref id="ref47"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>Direct training for spiking neural networks: Faster, larger, better</article-title>. <conf-name>Association for the Advancement of Artificial Intelligence (AAAI)</conf-name>.</citation></ref>
<ref id="ref48"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>C.</given-names></name> <name><surname>Han</surname> <given-names>X.</given-names></name> <name><surname>Zhou</surname> <given-names>D.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Li</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Progressive tandem learning for pattern recognition with deep spiking neural networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>7824</fpage>&#x2013;<lpage>7840</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3114196</pub-id></citation></ref>
<ref id="ref49"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Song</surname> <given-names>Y.</given-names></name> <name><surname>Jiang</surname> <given-names>Y.</given-names></name> <name><surname>Bai</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Dynamic threshold integrate and fire neuron model for low latency spiking neural networks</article-title>. <source>Neurocomputing</source> <volume>544</volume>:<fpage>126247</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2023.126247</pub-id></citation></ref>
<ref id="ref50"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>Q.</given-names></name> <name><surname>Qi</surname> <given-names>Y.</given-names></name> <name><surname>Yu</surname> <given-names>H.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Pan</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Csnn: an augmented spiking based framework with perceptron-inception</article-title>. <conf-name>International Joint Conference on Artificial Intelligence (Stockholm)</conf-name>.</citation></ref>
<ref id="ref51"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>Z.</given-names></name> <name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>Going deeper with directly-trained larger spiking neural networks</article-title>. <source>Neural Evol. Comput.</source> <volume>35</volume>, <fpage>11062</fpage>&#x2013;<lpage>11070</lpage>. doi: <pub-id pub-id-type="doi">10.1609/aaai.v35i12.17320</pub-id></citation></ref>
<ref id="ref52"><citation citation-type="confproc"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Gao</surname> <given-names>H.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Temporal-wise attention spiking neural networks for event streams classification</article-title>. <conf-name>Proceedings of the IEEE/CVF international conference on computer vision (ICCV)</conf-name>.</citation></ref>
<ref id="ref53"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <article-title>Inherent redundancy in spiking neural networks</article-title>. Proceeding of the IEEE/CVF international conference on computer vision (ICCV). arXiv preprint arXiv:2308.08227.</citation></ref>
<ref id="ref54"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Yuan</surname> <given-names>L.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2023b</year>). Spike-driven Transformer. Advances in Neural Information Processing Systems (NeurIPS). arXiv preprint arXiv:2307.01694.</citation></ref>
<ref id="ref55"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>M.</given-names></name> <name><surname>Zhao</surname> <given-names>R.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group> (<year>2023c</year>). <article-title>Attention spiking neural networks</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>45</volume>, <fpage>9393</fpage>&#x2013;<lpage>9410</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2023.3241201</pub-id></citation></ref>
<ref id="ref56"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>C.</given-names></name> <name><surname>Gu</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name> <name><surname>Wang</surname> <given-names>A.</given-names></name> <name><surname>Li</surname> <given-names>E.</given-names></name></person-group> (<year>2022</year>). <article-title>STSC-SNN: Spatio-temporal synaptic connection with temporal convolution and attention for spiking neural networks</article-title>. <source>Front. Neurosci.</source> <volume>16</volume>:<fpage>1079357</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fnins.2022.1079357</pub-id></citation></ref>
<ref id="ref57"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zenke</surname> <given-names>F.</given-names></name> <name><surname>Vogels</surname> <given-names>T. P.</given-names></name></person-group> (<year>2021</year>). <article-title>The remarkable robustness of surrogate gradient learning for instilling complex function in spiking neural networks</article-title>. <source>Neural Comput.</source> <volume>33</volume>, <fpage>899</fpage>&#x2013;<lpage>925</lpage>. doi: <pub-id pub-id-type="doi">10.1162/neco_a_01367</pub-id>, PMID: <pub-id pub-id-type="pmid">33513328</pub-id></citation></ref>
<ref id="ref58"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Luo</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Pan</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>An efficient threshold-driven aggregate-label learning algorithm for multimodal information processing</article-title>. <source>IEEE J. Sel. Top Signal Process</source> <volume>14</volume>, <fpage>592</fpage>&#x2013;<lpage>602</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JSTSP.2020.2983547</pub-id></citation></ref>
<ref id="ref59"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Wu</surname> <given-names>J.</given-names></name> <name><surname>Belatreche</surname> <given-names>A.</given-names></name> <name><surname>Amornpaisannon</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Rectified linear postsynaptic potential function for backpropagation in deep spiking neural networks</article-title>. <source>IEEE Trans. Neural Netw. Learn Syst.</source> <volume>33</volume>, <fpage>1947</fpage>&#x2013;<lpage>1958</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TNNLS.2021.3110991</pub-id></citation></ref>
<ref id="ref60"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Yu</surname> <given-names>L.</given-names></name> <name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Zhou</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023</year>). Spikingformer: Spike-driven residual learning for transformer-based spiking neural network. arXiv preprint arXiv:2304.11954.</citation></ref>
<ref id="ref61"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>Z.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <name><surname>He</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Yan</surname> <given-names>S.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). Spikformer: When spiking neural network meets transformer. ICLR, 2023. arXiv preprint arXiv:2209.15425.</citation></ref>
<ref id="ref62"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>R.</given-names></name> <name><surname>Zhao</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Deng</surname> <given-names>H.</given-names></name> <name><surname>Duan</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2022</year>). TCJA-SNN: Temporal-Channel joint attention for spiking neural networks. arXiv:2206.10177.</citation></ref>
</ref-list>
</back>
</article>