<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2019.00073</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>FLGR: Fixed Length Gists Representation Learning for RNN-HMM Hybrid-Based Neuromorphic Continuous Gesture Recognition</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Chen</surname> <given-names>Guang</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/191161/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Chen</surname> <given-names>Jieneng</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/589422/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Lienen</surname> <given-names>Marten</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="author-notes" rid="fn002"><sup>&#x02020;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/675900/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Conradt</surname> <given-names>J&#x000F6;rg</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/21060/overview"/>
</contrib>
<contrib contrib-type="author">
<name><surname>R&#x000F6;hrbein</surname> <given-names>Florian</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/54260/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Knoll</surname> <given-names>Alois C.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/42313/overview"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>College of Automotive Engineering, Tongji University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Chair of Robotics, Artificial Intelligence and Real-time Systems, Technische Universit&#x000E4;t M&#x000FC;nchen</institution>, <addr-line>Munich</addr-line>, <country>Germany</country></aff>
<aff id="aff3"><sup>3</sup><institution>College of Electronics and Information Engineering, Tongji University</institution>, <addr-line>Shanghai</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Computational Science and Technology, KTH Royal Institute of Technology</institution>, <addr-line>Stockholm</addr-line>, <country>Sweden</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Runchun Mark Wang, Western Sydney University, Australia</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Hesham Mostafa, University of California, San Diego, United States; Arren Glover, Fondazione Istituto Italiano di Tecnologia, Italy</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Guang Chen <email>guang&#x00040;in.tum.de</email></corresp>
<corresp id="c002">Alois C. Knoll <email>knoll&#x00040;in.tum.de</email></corresp>
<fn fn-type="other" id="fn001"><p>This article was submitted to Neuromorphic Engineering, a section of the journal Frontiers in Neuroscience</p></fn>
<fn fn-type="other" id="fn002"><p>&#x02020;These authors have contributed equally to this work</p></fn></author-notes>
<pub-date pub-type="epub">
<day>12</day>
<month>02</month>
<year>2019</year>
</pub-date>
<pub-date pub-type="collection">
<year>2019</year>
</pub-date>
<volume>13</volume>
<elocation-id>73</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>09</month>
<year>2018</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>01</month>
<year>2019</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2019 Chen, Chen, Lienen, Conradt, R&#x000F6;hrbein and Knoll.</copyright-statement>
<copyright-year>2019</copyright-year>
<copyright-holder>Chen, Chen, Lienen, Conradt, R&#x000F6;hrbein and Knoll</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract><p>A neuromorphic vision sensor is a novel passive sensing modality and frameless sensor with several advantages over conventional cameras. Frame-based cameras have an average frame-rate of 30 fps, causing motion blur when capturing fast motion, e.g., hand gesture. Rather than wastefully sending entire images at a fixed frame rate, neuromorphic vision sensors only transmit the local pixel-level changes induced by the movement in a scene when they occur. This leads to advantageous characteristics, including low energy consumption, high dynamic range, a sparse event stream and low response latency. In this study, a novel representation learning method was proposed: Fixed Length Gists Representation (FLGR) learning for event-based gesture recognition. Previous methods accumulate events into video frames in a time duration (e.g., 30 ms) to make the accumulated image-level representation. However, the accumulated-frame-based representation waives the friendly event-driven paradigm of neuromorphic vision sensor. New representations are urgently needed to fill the gap in non-accumulated-frame-based representation and exploit the further capabilities of neuromorphic vision. The proposed FLGR is a sequence learned from mixture density autoencoder and preserves the nature of event-based data better. FLGR has a data format of fixed length, and it is easy to feed to a sequence classifier. Moreover, an RNN-HMM hybrid was proposed to address the continuous gesture recognition problem. Recurrent neural network (RNN) was applied for FLGR sequence classification while hidden Markov model (HMM) is employed for localizing the candidate gesture and improving the result in a continuous sequence. A neuromorphic continuous hand gestures dataset (Neuro ConGD Dataset) was developed with 17 hand gesture classes for the community of the neuromorphic research. 
Hopefully, FLGR can inspire the study on the event-based highly efficient, high-speed, and high-dynamic-range sequence classification tasks.</p></abstract>
<kwd-group>
<kwd>representation learning</kwd>
<kwd>neuromorphic vision</kwd>
<kwd>continuous gesture recognition</kwd>
<kwd>mixture density autoencoder</kwd>
<kwd>recurrent neural network</kwd>
<kwd>hidden markov model</kwd>
</kwd-group>
<counts>
<fig-count count="8"/>
<table-count count="2"/>
<equation-count count="18"/>
<ref-count count="44"/>
<page-count count="13"/>
<word-count count="8224"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1. Introduction</title>
<p>Gesture recognition has attracted rising attention because of its emerging significance in many robotic applications e.g., safe human-robot cooperation in an industrial environment. However, conventional camera-based gesture recognition exhibits two major drawbacks. First, the reaction speed of the conventional camera is limited by its frame rate, typically 30 fps, causing motion blur when capturing fast hand motions. Second, the accumulated-frame-based visual acquisition can lead to data redundancy and memory requirement, thereby hampering the large scale commercial usage in embedded systems. Compared with conventional cameras, neuromorphic vision sensors as a bio-inspired sensor do not capture full images at a fixed frame-rate. Besides, they are characterized by high temporal resolution (microseconds), high dynamic range (120&#x02013;140 dB), low power and low bandwidth. Neuromorphic vision represents a paradigm shift in computer vision because of its principle of the operation and the unconventional output.</p>
<p>However, current study on neuromorphic gesture recognition all belongs to segmented gesture recognition. For segmented gesture recognition, the scenario of the problem can be simply described as classifying a well-delineated sequence of video frames as one of a set of gesture types. This is in contrast to continuous/online human gesture recognition where there are no a priori given boundaries of gesture execution (Aggarwal and Ryoo, <xref ref-type="bibr" rid="B2">2011</xref>; Wang et al., <xref ref-type="bibr" rid="B42">2018</xref>). It is meaningful to develop novel architecture for neuromorphic continuous gesture recognition, which is the first step to achieve online recognition.</p>
<p>However, given the events nature of variable length and asynchronous sequence, it is not suitable for feeding the events to common classifier directly for sequence classification tasks e.g., gesture recognition. Existing works accumulate neuromorphic sensor&#x00027;s output events in a duration (e.g., 30 ms), and denote them as image frame (Moeys et al., <xref ref-type="bibr" rid="B26">2016</xref>). These methods perform the classification and recognition task on an image level, thereby waiving the nature of events. Hence, new representations and technologies are urgently needed to exploit the capabilities of neuromorphic vision. The aim of this study was twofold: to explore a novel representation of neuromorphic events and to investigate the ability to translate successes in field of deep learning into neuromorphic vision in gesture recognition.</p>
<sec>
<title>1.1. Neuromorphic Vision Sensor</title>
<p>The dynamic vision sensor (DVS), a type of neuromorphic vision sensor (Lichtsteiner et al., <xref ref-type="bibr" rid="B24">2008</xref>), was employed to acquire the hand gesture data. The design of neuromorphic vision sensors is inspired by the way vision happens on the retina of a biological eye, e.g., the human eye, which is reflected in its eponymous attributes, including asynchronous and temporal contrast. The former indicates that each of the DVS pixels leads to an intensity change once it is triggered as opposed to the synchronous way in which a conventional camera queries all pixels at once every few milliseconds. The latter implies that a pixel is triggered when the variation in light intensity at its position exceeds a certain threshold. These attributes make the pixels of the DVS comparable to retinal ganglion cells.</p>
<p>The DVS applied here has a spatial resolution of 128 &#x000D7; 128 pixels as well as a temporal resolution of microseconds, suggesting that events are timestamped by a free-running counter ticking up at 11 kHz. Each pixel circuit tracks the temporal contrast defined as light log-intensity. An event is triggered every time the temporal contrast passes a threshold &#x003B8;. The whole process exhibits a latency of 15 &#x003BC;s. The DVS streams events over USB in address-event representation (AER). In AER, each event is a 4-tuple (<italic>t, x, y, p</italic>) where <italic>t</italic> denotes the timestamp; <italic>x</italic> and <italic>y</italic> are the coordinates of the event&#x00027;s origin; <italic>p</italic> is the event&#x00027;s polarity.</p>
</sec>
<sec>
<title>1.2. Representation for Neuromorphic Vision</title>
<p>Since the stream of neuromorphic events is asynchronous and variable in length, researchers tried to represent them as another type of data easy to process for later detection and recognition tasks. Existing methods for representation of DVS events are divided into 4 types, namely the fully accumulated-frame-based representation, the semi-accumulated-frame-based representation, the reconstructed-frame-based representation and the non-accumulated-frame-based representation. First, the fully accumulated frame-based representation is the most broadly used representation of neuromorphic events. Park et al. (<xref ref-type="bibr" rid="B36">2016</xref>) and Maqueda et al. (<xref ref-type="bibr" rid="B25">2018</xref>) accumulated the events into the frame with a duration of 30 ms on average. Vidal et al. (<xref ref-type="bibr" rid="B41">2018</xref>) collapsed every spatio-temporal window of events to a synthetic accumulated frame by drawing each event on the image frames. They used FAST corner detector to extract features on the frames. Second, the events were processed by the semi-accumulated-frame-based representation before being accumulated into a frame (Lee et al., <xref ref-type="bibr" rid="B22">2014</xref>; Mueggler et al., <xref ref-type="bibr" rid="B28">2015</xref>). Mueggler et al. (<xref ref-type="bibr" rid="B28">2015</xref>) processed the events by the lifetime estimation and accumulated them to yield the shape gradient image. Lee et al. (<xref ref-type="bibr" rid="B22">2014</xref>) processed the events by means of leaky integrate-and-fire (LIF) neurons and clustered a moving hand by accumulating the output events from LIF with a 3-ms interval. Third, Bardow et al. (<xref ref-type="bibr" rid="B5">2016</xref>) and Munda et al. (<xref ref-type="bibr" rid="B30">2018</xref>) exploited intensity change to reconstruct the gray image. However, note that all three methods above process the events on an accumulated image frame level. 
Since the transformed images are often blurred and redundant, the image-level preprocessing negatively affects model performance and abandons the hardware friendly event-driven paradigm. As a result, such methods waive the nature of events data and lead to unnecessary redundancy of data and memory requirement. In recent years, the processing of event sequence is no longer on an image level, but more focused on the natural processing of event sequence (Neil et al., <xref ref-type="bibr" rid="B32">2016</xref>; Wu et al., <xref ref-type="bibr" rid="B44">2018</xref>). Wu et al. (<xref ref-type="bibr" rid="B44">2018</xref>) first trained an event-driven LSTM and proved the capability of recurrent neural network (RNN) to process event-based classification task. Note that they applied their framework on N-MNIST dataset, which is a toy dataset of handwritten digits. A review paper (Cadena et al., <xref ref-type="bibr" rid="B6">2016</xref>) highlighted that the main bottleneck of event-based computer vision is how to represent events sequence appropriately. Since the output consists of a sequence of asynchronous events, traditional frame-based computer-vision algorithms are not applicable. This requires a paradigm shift from the traditional computer vision approaches developed over the past 5 decades. They explained that the design goal of such algorithms is to preserve the event-based nature of the sensor. Thus, it is necessary to further prove the capability of the non-accumulated-image-based representation by applying them to event-driven tasks.</p>
</sec>
<sec>
<title>1.3. Related Works</title>
<p>Under the recent development of deep learning (Krizhevsky et al., <xref ref-type="bibr" rid="B20">2012</xref>), many methods used for hand gesture recognition with conventional cameras have been presented based on Convolutional Neural Networks (ConvNets) (Ji et al., <xref ref-type="bibr" rid="B16">2013</xref>; Neverova et al., <xref ref-type="bibr" rid="B34">2014</xref>; Molchanov et al., <xref ref-type="bibr" rid="B27">2015</xref>; Knoller et al., <xref ref-type="bibr" rid="B19">2016</xref>; Sinha et al., <xref ref-type="bibr" rid="B39">2016</xref>) and RNN (Ohn-Bar and Trivedi, <xref ref-type="bibr" rid="B35">2014</xref>; Neverova et al., <xref ref-type="bibr" rid="B33">2016</xref>; Wu et al., <xref ref-type="bibr" rid="B43">2016</xref>). Among these frameworks, RNNs are attractive because they equip neural networks with memories for temporal tasks, and the introduction of gating units e.g., LSTM and GRU (Hochreiter and Schmidhuber, <xref ref-type="bibr" rid="B14">1997</xref>; Cho et al., <xref ref-type="bibr" rid="B8">2014</xref>) has significantly contributed to making the learning of these networks manageable. In general, deep-learning-based methods outperform traditional handcrafted-feature-based methods in gesture recognition task (Wang et al., <xref ref-type="bibr" rid="B42">2018</xref>).</p>
<p>All the efforts above rely on conventional cameras at fixed frame-rate. Conventional cameras will suffer from various motion-related artifacts (motion blur, rolling shutter, etc.) which may affect the performance for the rapid gesture recognition. In contrast, the event data generated by neuromorphic vision sensors are natural <italic>motion detectors</italic> and automatically filter out any temporally redundant information. The DVS is a promising sensor for low latency and low bandwidth tasks. A robotic goal keeper was presented in Delbruck and Lang (<xref ref-type="bibr" rid="B10">2013</xref>) with a reaction time of 3 ms. Robot localization was demonstrated by Mueggler et al. (<xref ref-type="bibr" rid="B29">2014</xref>) using a DVS during high-speed maneuvers, in which rotational speed was measured up to 1,200&#x000B0;/s during quadrotor flips. In the meantime, gesture recognition is vital in human-robot interaction. Hence, the neuromorphic gesture recognition system is urgently needed.</p>
<p>Ahn et al. (<xref ref-type="bibr" rid="B3">2011</xref>) were one of the first groups to use the DVS for gesture recognition when detecting and distinguishing between the 3 throws of the classical rock-paper-scissors game. It is noteworthy that their work was published in <xref ref-type="bibr" rid="B3">2011</xref>, which predates the deep learning era. The DVS&#x00027; inventors performed gesture recognition with spiking neural networks and leaky integrate-and-fire (LIF) neurons (Gerstner and Kistler, <xref ref-type="bibr" rid="B12">2002</xref>; Lee et al., <xref ref-type="bibr" rid="B21">2012a</xref>,<xref ref-type="bibr" rid="B23">b</xref>, <xref ref-type="bibr" rid="B22">2014</xref>). Spiking neural networks (SNNs) are trainable models of the brain, thereby being suitable for neuromorphic sensors. In <xref ref-type="bibr" rid="B36">2016</xref> deep learning was first applied for gesture recognition with DVS (Park et al., <xref ref-type="bibr" rid="B36">2016</xref>). With super-resolution technology by spatiotemporal demosaicing on the event stream, they trained a GoogLeNet CNN with the reconstructed information to classify these temporal-fusion frames and decode the network output with an LSTM. Amir et al. (<xref ref-type="bibr" rid="B4">2017</xref>) processed a live DVS event stream with IBM TrueNorth, a natively event-based processor containing 1 million spiking neurons. Configured as a convolutional neural network (CNN), the TrueNorth chip identifies the onset of a gesture with a latency of 105 ms while consuming &#x0003C;200 mW.</p>
<p>In fact, continuous gesture recognition is a task totally different from the segmented gesture recognition. For the segmented gesture recognition (Lee et al., <xref ref-type="bibr" rid="B21">2012a</xref>; Amir et al., <xref ref-type="bibr" rid="B4">2017</xref>), the scenario of the problem can be summarized as classifying a well-delineated sequence of video frames as one of a set of gesture types. This is in contrast with the continuous/online human gesture recognition where there are no a priori given boundaries of gesture execution. In a simple case where a video is segmented to contain only one execution of a human gesture, the system aims to correctly classify the video into its gesture category. In more general and complex cases, the continuous recognition of human gestures must be performed to detect the starting and ending times of all occurring gestures from an input video (Aggarwal and Ryoo, <xref ref-type="bibr" rid="B2">2011</xref>). However, there has been no measurement till now for the detection performance in neuromorphic gesture recognition task. In brief, the continuous gesture recognition is the first step to reach online recognition though it is harder than the segmented gesture recognition (Wang et al., <xref ref-type="bibr" rid="B42">2018</xref>).</p>
<p>However, the non-accumulated-image-based representation for event-driven recognition has not aroused enough attention. Both methods, Park et al. (<xref ref-type="bibr" rid="B36">2016</xref>) and Amir et al. (<xref ref-type="bibr" rid="B4">2017</xref>), belong to the semi-accumulated-frame-based representation and train CNN on the frames. Moreover, the CNN in Amir et al. (<xref ref-type="bibr" rid="B4">2017</xref>) was based on a neuromorphic hardware, which is not fully accessible to scientific and academic fields. There has been no pure deep network that can process the sequence of non-accumulated-frame-based representation for the gesture recognition task. A deep network should be urgently designed to process events or non-accumulated-frame-based representation sequence to explore a paradigm shift in neuromorphic vision community (Cadena et al., <xref ref-type="bibr" rid="B6">2016</xref>). Because of the asynchronous nature of the data, the direct raw event-based recognition might be unsatisfactory. How to learn a novel non-accumulated-frame-based representation for event-driven recognition therefore becomes a promising direction to reduce the noted negative effect and maximize the capability of the event-based sequence data.</p>
<p>The rest of this study is organized as follows: section 2 describes the preprocessing, the representation learning and RNN-HMM hybrid temporal classification for neuromorphic continuous gesture recognition. Section 3 verified the Neuro ConGD dataset collection, evaluation metrics and experimental results. Section 4 draws the conclusion of this study.</p>
</sec>
</sec>
<sec sec-type="methods" id="s2">
<title>2. Methods</title>
<p>In this section, the framework for neuromorphic continuous gesture recognition is to be described. The main idea of this study is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Overview of the framework for neuromorphic continuous gesture recognition. The autoencoder network was split into an encoder and a decoder, sharing information only along a single edge in the computational graph. The autoencoder was trained in an unsupervised way. The encoder part transforms variable-length event sequences of fixed duration into fixed-length vectors. The representation learning module learns a novel representation FLGR (Fixed Length Gists Representation). The FLGR sequences are then fed to the hybrid system with RNN and HMM to make a temporal classification.</p></caption>
<graphic xlink:href="fnins-13-00073-g0001.tif"/>
</fig>
<p>The framework consists of two major parts, namely representation learning and temporal classification. In section 2.1, how the events triggered from DVS were preprocessed is first introduced. In section 2.2, a specific type of network <italic>Mixture Density Autoencoder</italic> is proposed to learn an efficient representation directly. In section 2.3, an RNN-HMM hybrid system is proposed to compute a label sequence from an input. The RNN provides localized classifications of each sequence element while the HMM segments the input based on the basis of the RNN output and deduces the most likely label sequence.</p>
<sec>
<title>2.1. Event Preprocessing</title>
<p>The aim of the preprocessing stage is to make the raw events data time-invariant, location-invariant and standardized. Each event was finally mapped from a 4-dimensional raw feature to a 6-dimensional preprocessed feature (see Equation 2).</p>
<p>To make the events sequence time invariant, a new variable &#x003B4;<italic>t</italic> was introduced, which is defined as the time passed since the previous event, i.e., &#x003B4;<italic>t</italic><sup>(<italic>i</italic>)</sup> &#x0003D; <italic>t</italic><sup>(<italic>i</italic>)</sup>&#x02212;<italic>t</italic><sup>(<italic>i</italic>&#x02212;1)</sup> with a value of 0 as the base case. In such a way, arbitrary timestamp of the previous event was replaced.</p>
<p>To make the data location-invariant, we keep track of a mean &#x003BC;<sub><italic>x</italic></sub> with exponentially decaying weights, which gives more weight to recent events and can be cheaply computed in a streaming context such as online recognition. &#x003BC;<sub><italic>x</italic></sub> that tracks a quantity <italic>x</italic> through continuous time is defined as</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>x</italic><sup>(<italic>i</italic>)</sup> was observed at time <italic>t</italic><sup>(<italic>i</italic>)</sup>. The parameter &#x003B1; controls how much weight is placed in past data.</p>
<p>We keep two means for each of <italic>x</italic> and <italic>y</italic>, one with &#x003BB; &#x0003D; 1&#x000A0;s and another with &#x003BB; &#x0003D; 50&#x000A0;ms. The first was supposed to track the main movement of the hand, while the second was to track fast movement like individual fingers.</p>
<p>In general, the preprocessing mapped each event from a 4-dimensional raw feature to a 6-dimensional feature as follows</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>&#x021A6;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003B4;</mml:mi><mml:msup><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M3"><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M4"><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>,<inline-formula><mml:math id="M5"><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>1</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>s</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math 
id="M6"><mml:mi>&#x003B4;</mml:mi><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mo>=</mml:mo><mml:mstyle class="mbox"><mml:mn>50</mml:mn></mml:mstyle><mml:mspace width="0.3em" class="thinspace"/><mml:mstyle class="mbox"><mml:mtext>ms</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
</sec>
<sec>
<title>2.2. Representation Learning for FLGR</title>
<p>The aim of the representation learning stage is to learn features from the variable-length event sequence. A mixture density network following the autoencoder architecture proposed in Cho et al. (<xref ref-type="bibr" rid="B8">2014</xref>) was utilized, which was originally employed for machine translation. Both the encoder and decoder of the mixture density autoencoder consist of Gated Recurrent Units (GRU). The representations learned by the autoencoder are termed <italic>Fixed Length Gist Representation (FLGR)</italic>. First, FLGR encodes the gist of the input. Second, the variable-length event sequences of fixed duration are transformed into fixed-length vectors with the representation learning. We hope to inspire greater efforts along the lines of non-accumulated-image-based representation research on neuromorphic vision.</p>
<sec>
<title>2.2.1. Mixture Density Autoencoder</title>
<p>The aim of the mixture density autoencoder is to learn a low-dimensional representation of the input data from which it can later reconstruct the input. Graves (<xref ref-type="bibr" rid="B13">2013</xref>) proposed mixture density network to generate handwriting sequence from a trained network by learning input sequence&#x00027;s distribution. The property of mixture density network was exploited to make the autoencoder transform variable length event sequences of fixed duration into fixed-length vectors.</p>
<p>The autoencoder network was split into an encoder and a decoder, sharing information only along a single edge in the computational graph (see <xref ref-type="fig" rid="F2">Figure 2</xref>). This edge initializes the decoder&#x00027;s hidden state with the final hidden state of the encoder. It is the figurative funnel in the network as it has to encode the complete input sequence. The mixture density autoencoder was trained to produce a probability distribution over sequences rather than sequences directly. Our network processed an input sequence of length <italic>n</italic>, where <italic>n</italic> is variable, by encoding the complete sequence first. Subsequently, it used the decoder to produce a distribution over a sequence of length <italic>n</italic> and computed a loss between the two sequences for training. The mixture density network&#x00027;s output parameterizes a distribution, which is a mixture of Gaussians over the real attribute and a Bernoulli distribution over the categorical attribute. It is noteworthy that the outputs of our mixture density autoencoder are parameters of the mixture distribution corresponding to the input event sequence. These parameters were used to reconstruct the sequence. During training we use the encoder and decoder together as an autoencoder for the sequence and derive the training signal from the reconstruction error of the sequence. Then, we throw away the decoder and rely solely on the encoder to generate the enriched, learned FLGR representation.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Sketch of our autoencoder architecture that encodes an input sequence <italic>x</italic> of length <italic>n</italic> into hidden states <italic>e</italic>. The decoder is trained to decode the last hidden state <italic>e</italic><sup>(<italic>n</italic>)</sup> into a sequence <italic>y</italic><sup>(1)</sup>, &#x02026;, <italic>y</italic><sup>(<italic>n</italic>)</sup> resembling the input. Each <italic>y</italic><sup>(<italic>i</italic>)</sup> is a non-negative vector whose entries sum up to 1 and its <italic>j</italic>-th entry encodes the network&#x00027;s belief that the <italic>j</italic>-th word should be placed at this point in the output sequence. Note that this is a sketch for intuitive understandability. Both encoder and decoder have 3 GRU layers each. Implementation details can be seen in section 2.4.</p></caption>
<graphic xlink:href="fnins-13-00073-g0002.tif"/>
</fig>
</sec>
<sec>
<title>2.2.2. Gated Recurrent Unit</title>
<p>A crucial property of our recurrent model is its ability to operate over input event sequences. In the proposed mixture density autoencoder, both encoder and decoder consist of 3 GRU layers. Though works on sequence encoding and classification often leverage Long Short-Term Memory (LSTM) cells, it was reported that a GRU-based architecture exhibits slightly better performance, is more robust over a wider range of hyperparameters, and has fewer parameters, suggesting slightly faster training and better test-time performance. This is consistent with empirical findings from prior work on deep recurrent models in other domains (Jozefowicz et al., <xref ref-type="bibr" rid="B17">2015</xref>). The GRU merges the cell state into the hidden state <italic>h</italic><sup>(<italic>t</italic>)</sup>, combines the input and forget gates into a single update gate <italic>z</italic><sup>(<italic>t</italic>)</sup> and replaces the output gate with a reset gate <italic>r</italic><sup>(<italic>t</italic>)</sup> that has no equivalent in LSTM.</p>
<p>Thus, at each time step <italic>t</italic>, we took the hidden state <italic>h</italic><sup>(<italic>t</italic>)</sup> of the final GRU layer in the recurrent stage as our sequence encoding, where <italic>h</italic><sup>(<italic>t</italic>)</sup> is defined as:</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M7"><mml:msup><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mi>U</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E4"><label>(4)</label><mml:math id="M8"><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>z</mml:mi></mml:msub><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mi>U</mml:mi><mml:mi>z</mml:mi></mml:msub><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E5"><label>(5)</label><mml:math id="M9"><mml:msup><mml:mover accent='true'><mml:mi>h</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>tanh</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>W</mml:mi><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mi>U</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msup><mml:mi>r</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x025E6;</mml:mo><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:math></disp-formula>
<disp-formula id="E6"><label>(6)</label><mml:math id="M10"><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x025E6;</mml:mo><mml:msup><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x025E6;</mml:mo><mml:msup><mml:mover accent='true'><mml:mi>h</mml:mi><mml:mo>&#x002DC;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:math></disp-formula>
</sec>
</sec>
<sec>
<title>2.3. RNN-HMM Hybrid for Temporal Classification</title>
<p>The aim of temporal classification was to transform an event sequence to a sequence of 17 gesture labels. Wu et al. (<xref ref-type="bibr" rid="B44">2018</xref>) trained an event-driven RNN on DVS-MNIST dataset, verifying the capability of RNN to process the event-based classification task. RNNs consisting of LSTM units or GRU units are efficient methods for continuous gesture recognition (Chai et al., <xref ref-type="bibr" rid="B7">2016</xref>; Cui et al., <xref ref-type="bibr" rid="B9">2017</xref>). Moreover, the hybrid system combined with neural network and hidden Markov model (HMM) will significantly enhance the performance of temporal classification (Abdel-Hamid et al., <xref ref-type="bibr" rid="B1">2012</xref>; Gaikwad, <xref ref-type="bibr" rid="B11">2012</xref>). Based on the above information, an RNN-HMM hybrid for temporal neuromorphic continuous gesture classification was developed. Our RNN-HMM hybrid consists of two modules: sequence classification with an RNN to produce a distribution of labels, and an HMM to decode the distribution of labels into the correct gesture label. Though this study focuses on the case of gesture recognition, we hope to inspire more efforts on neuromorphic temporal classification tasks based on the proposed RNN-HMM hybrid.</p>
<sec>
<title>2.3.1. Sequence Classification With Recurrent Neural Network</title>
<p>The aim of sequence classification was to take an input sequence and produce a distribution of labels. An RNN was employed for event sequence classification. To classify a continuous gesture sequence, localized classifications were required to be decoded into global classifications. In other words, the network should assign each unit of the input FLGR sequence to one of 17 classes. The 17 classes contain 16 gestures plus the <italic>blank</italic> label (see section 3.1 for the definition of gesture classes).</p>
<p><xref ref-type="fig" rid="F3">Figure 3</xref> shows the structure of our RNN network. The RNN consists of three GRU layers, two fully-connected layers with <italic>tanh</italic> activation and finally a fully-connected layer that projects the output down into &#x0211D;<sup>17</sup>. The definition of GRU and the reason to choose GRU instead of other recurrent units like LSTM are explained in section 2.2.2. In our RNN sequence classifier, the learning rate, decay rate, and neuron number of each GRU were set to 10<sup>&#x02212;3</sup>, 0.95, and 256, respectively. The loss function is cross entropy measuring the difference between the labels and predicted outputs. The output is transformed with SoftMax to parameterize a multinoulli distribution over the output classes.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>The RNN network consists of three layers of GRUs, each of which has N units, i.e., 256 units in our setting. The output of RNN is transformed with Softmax to parameterize a distribution over the classes. Finally the network can be trained to produce prediction y.</p></caption>
<graphic xlink:href="fnins-13-00073-g0003.tif"/>
</fig>
<p>The output of a sequence classifier is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. Since the <italic>blank</italic> class took up nearly 50% of the training data, the classifier recognized non-gesture data with high accuracy. When an activity was detected, the classifier assigned high probabilities to multiple classes at first until it discerned a single label as the correct one.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Class probabilities attained from an RNN sequence classifier. The shaded regions in the background designate the ground truth (best viewed in color).</p></caption>
<graphic xlink:href="fnins-13-00073-g0004.tif"/>
</fig>
</sec>
<sec>
<title>2.3.2. HMM Decoding and Segmentation</title>
<p>The major goal of our HMM is to process the noisy classifications produced by the sequence classifier. The sequence classifier with an RNN points out which gesture most likely happens at each point in time. However, there exists a huge amount of noisy classifications produced by the RNN sequence classifier (see <xref ref-type="fig" rid="F5">Figure 5</xref>). For instance, a <italic>swipe-down</italic> gesture might be classified as <italic>rotate-outward</italic> for the first few milliseconds, then <italic>swipe-up</italic> for another few and finally as <italic>swipe-down</italic> for the rest of the activity. Furthermore, this sequence of probability distributions should be deciphered into a single <italic>swipe-down</italic> label. The solution was reached using HMM decoding (HMM decoder).</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Noisy classifications produced by an RNN classifier (best viewed in color).</p></caption>
<graphic xlink:href="fnins-13-00073-g0005.tif"/>
</fig>
<p>An HMM consists of a Markov chain of hidden states <italic>z</italic><sup>(<italic>t</italic>)</sup> &#x02208; {1, &#x02026;, <italic>K</italic>}, an observation model <italic>x</italic><sup>(<italic>t</italic>)</sup> and a transition model expressed as transition matrix <italic>A</italic>. An HMM models a situation where a state of interest is only indirectly observable through emissions at each timestep. We have a sequence of local classifications of each frame into one of 17 classes and would like to derive the true underlying sequence of gestures. An HMM helps us incorporate the knowledge that a state <italic>i</italic> might be observed as any other state for a short while through the observation matrix <italic>B</italic>. An efficient algorithm, <italic>Viterbi decoding</italic>, was employed to decode an observation sequence into the most likely underlying state sequence. <italic>Viterbi decoding</italic> produces the most likely sequence of hidden states</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo class="qopname">&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:munder></mml:mstyle><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo class="qopname">&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo stretchy="false">|</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo class="qopname">&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E8"><label>(8)</label><mml:math id="M12"><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x000B7;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mi>p</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x000B7;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x0220F;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mi>p</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:math></disp-formula>
<disp-formula id="E9"><label>(9)</label><mml:math id="M13"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;max&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mtext>log&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>given a sequence of observations. Since the RNN classifier with softmax layer produces <inline-formula><mml:math id="M14"><mml:mrow><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></inline-formula> instead of <inline-formula><mml:math id="M15"><mml:mrow><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></inline-formula>, the decoding objective can be rewritten in accordance with Bayes&#x00027; theorem.</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M16"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mtext>log&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x0002B;</mml:mo><mml:mtext>log&#x000A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mtext>log&#x000A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The <inline-formula><mml:math id="M17"><mml:mrow><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></inline-formula> term is irrelevant to the argmax as it does not depend on <italic>z</italic>.</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M18"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mtext>log&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02212;</mml:mo><mml:mtext>log&#x000A0;</mml:mtext><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E12"><label>(12)</label><mml:math id="M19"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mtext>log&#x000A0;</mml:mtext><mml:msub><mml:mi>&#x003C0;</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>2</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x0002B;</mml:mo><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:mtext>log&#x000A0;</mml:mtext></mml:mrow></mml:mstyle><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>&#x0007C;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The Viterbi algorithm finds the maximizer by computing the probability of being in state <italic>j</italic> at time <italic>t</italic> since the most probable path is taken.</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M20"><mml:mrow><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mi>max</mml:mi></mml:mrow><mml:mrow><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:munder><mml:mi>p</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>z</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy="false">&#x0007C;</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<p>The key insight here is that the most probable path to state <italic>j</italic> at time <italic>t</italic> must be the one that maximizes the joint probability of being in state <italic>k</italic> at time <italic>t</italic> &#x02212; 1 and transitioning from <italic>k</italic> to <italic>j</italic>, i.e.,</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M21"><mml:mrow><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>j</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>max&#x000A0;</mml:mtext></mml:mrow><mml:mi>i</mml:mi></mml:munder><mml:msub><mml:mi>&#x003B4;</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mi>B</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:mrow></mml:math></disp-formula>
<p>By computing &#x003B4; for <italic>t</italic> from 1 to <italic>n</italic> and storing the maximizer <italic>i</italic> in another table &#x003B1;<sub><italic>tj</italic></sub>, you can find the most probable final state as <inline-formula><mml:math id="M22"><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>arg&#x000A0;max</mml:mtext></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003B4;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and work your way back to <italic>t</italic> &#x0003D; 1 by following the <inline-formula><mml:math id="M23"><mml:msub><mml:mrow><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>z</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:msub></mml:math></inline-formula> to the predecessor state and so forth. This explanation is summarized from Murphy (<xref ref-type="bibr" rid="B31">2012</xref>).</p>
<p>Since the observation matrix <italic>B</italic> can be derived from the output of the RNN classifier with a SoftMax layer, the construction of an HMM decoder was reduced to finding <italic>A</italic> and &#x003C0;.</p>
<p><xref ref-type="fig" rid="F6">Figure 6A</xref> shows that an HMM decoder is capable of recognizing the points of activity in a sequence of local classifications, and it is also reasonably accurate in decoding them into the true label.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>Comparison of the two decoding methods on the same recording. The shaded regions in the background denote the ground truth (best viewed in color). <bold>(A)</bold> Direct HMM decoder. <bold>(B)</bold> HMM segmenter plus segment decoding.</p></caption>
<graphic xlink:href="fnins-13-00073-g0006.tif"/>
</fig>
<p>However, there were numerous spurious labels mixed in with the true labels (see <xref ref-type="fig" rid="F6">Figure 6A</xref>). To solve the mixing problem, an HMM segmenter was developed, and the decoding process was divided into two parts: first, an HMM with just two states, <italic>gesture</italic> and <italic>blank</italic>, segments the sequence; subsequently, a second HMM produces a single label for each segment.</p>
<p>The HMM segmenter was constructed in the same way as the decoder with the twist that all gestures are combined into a single hidden state <italic>gesture</italic>. When the HMM segments a recording, the probability of the <italic>gesture</italic> state is the sum of all gesture probabilities. To suppress the remaining spurious activations, all segments shorter than 500 ms were also filtered out, since we know from the dataset statistics that the shortest gesture is over a second long on average. <xref ref-type="fig" rid="F6">Figure 6B</xref> shows that the contamination of mixing labels was almost gone compared with the result in <xref ref-type="fig" rid="F6">Figure 6A</xref>. Thus, the results were improved after implementing the HMM segmenter.</p>
</sec>
</sec>
<sec>
<title>2.4. Network Training and Implementation</title>
<p>The aim of the training process is to estimate the model parameters in our architecture. During training we used the encoder and decoder together as a mixture density autoencoder. We reconstructed the sequence by means of the mixture distribution produced by the autoencoder. We derived the training signal from the reconstruction error of the sequence. The encoder part of the trained encoder-decoder was employed to generate FLGR data from variable-length event sequences. Given the sequence of FLGR, the hybrid system with the RNN classifier and HMM was trained to predict the corresponding label.</p>
<p>The training events segment and batch were generated as follows: the time window <italic>T</italic><sub><italic>w</italic></sub> with a fixed duration was constructed as a segment. Events fell into different <italic>T</italic><sub><italic>w</italic></sub> with variable length <italic>L</italic><sub><italic>i</italic></sub>. The max value of <italic>L</italic><sub><italic>i</italic></sub> among different <italic>T</italic><sub><italic>w</italic></sub> was then computed as <italic>L</italic><sub><italic>max</italic></sub>. Each batch contains several <italic>T</italic><sub><italic>w</italic></sub> with the amount of batch size <italic>S</italic><sub><italic>batch</italic></sub>. The final data of a batch has the shape of (<italic>S</italic><sub><italic>batch</italic></sub>, <italic>L</italic><sub><italic>max</italic></sub>, <italic>S</italic><sub><italic>event</italic></sub>) where <italic>S</italic><sub><italic>event</italic></sub> denotes the feature size in each processed events. In our training, the <italic>T</italic><sub><italic>w</italic></sub>, <italic>S</italic><sub><italic>batch</italic></sub>, and <italic>S</italic><sub><italic>event</italic></sub> were set to 2.5 ms, 32 and 6, respectively.</p>
<p>In our implementation, the training procedure of our mixture density autoencoder is as follows. Both the encoder and decoder are 3 layers of GRUs with 256 neurons each. The encoder receives preprocessed events in &#x0211D;<sup>6</sup>. The decoder produces parameters for a 10-component mixture distribution of Gaussians with diagonal covariance matrices over &#x0211D;<sup>5</sup> and a single parameter for a Bernoulli distribution over {&#x02212;1, 1}. This adds up to 10 component weights, 10&#x000B7;5 &#x0003D; 50 mean parameters, 10&#x000B7;5 &#x0003D; 50 diagonal covariance matrix entries and a single Bernoulli parameter, for a total of 111 parameters. To project its 256-dimensional output into &#x0211D;<sup>111</sup>, the decoder has a single fully-connected layer with weight matrix and bias term but without non-linearity on top. According to the output distribution, the loss is the negative log-likelihood of the input sequence. The network weights are learned using mini-batch gradient descent with batch size 32. The optimizer is Adam (Kingma and Ba, <xref ref-type="bibr" rid="B18">2015</xref>) with a learning rate of 10<sup>&#x02212;4</sup> and an exponential decay rate of 0.95 to the power of the current epoch. The gradients were clipped at a norm of 5. This also helps to solve numerical instabilities if the covariances of the mixture distribution become very small.</p>
<p>For the RNN sequence classifier, the learning rate, decay rate, and neuron number of each GRU were set to 10<sup>&#x02212;3</sup>, 0.95, and 256, respectively. The loss function was cross entropy measuring the difference between the labels and predicted outputs.</p>
<p>The construction of an HMM decoder from the training data aimed to find <italic>A</italic> and &#x003C0;. We define the <italic>A</italic><sub><italic>i</italic>, 17</sub> entries, the transition probability from gesture <italic>i</italic> to <italic>blank</italic>, as the proportion of frames belonging to class <italic>i</italic> that transition to <italic>blank</italic>, and <italic>A</italic><sub><italic>i, i</italic></sub>, the self-transition probability, as 1 &#x02212; <italic>A</italic><sub><italic>i</italic>, 17</sub>. The transition probability from <italic>blank</italic> to any of the gestures was the proportion of gesture gists following blank gists, and the self-transition probability acted as the complementary part.</p>
<p>For the programming platform, a Titan X graphics card and an Intel Core i7-5930K processor were utilized for training, processing, and implementation.</p>
</sec>
</sec>
<sec id="s3">
<title>3. Experiments</title>
<p>In this section, the Neuromorphic Continuous Gesture Dataset (Neuro ConGD) and the evaluation protocol are to be described. The dataset contains the raw recorded events and the preprocessed data. The experimental results of the proposed method on this dataset were reported and compared with the baselines.</p>
<sec>
<title>3.1. Neuro ConGD Dataset</title>
<p>Numerous gesture datasets have been created in recent years, as thoroughly reviewed in Ruffieux et al. (<xref ref-type="bibr" rid="B38">2014</xref>). Most of the datasets were recorded with frame-based cameras, e.g., the conventional color camera, the stereo camera and the Kinect. Hu et al. (<xref ref-type="bibr" rid="B15">2016</xref>) reported the urgent need for neuromorphic datasets for further research in the event-based computer vision. One of the contributions of this study is that a new neuromorphic continuous gesture (Neuro ConGD) dataset was collected with an event-based camera.</p>
<p>The Neuro ConGD dataset was recorded with a DVS sensor which has a spatial resolution of 128x128 pixels. An appropriate distance between hand and DVS should be first selected to make gesture distinguishable from noise. <xref ref-type="fig" rid="F7">Figure 7</xref> shows the event rates of three recordings taken at three different distances between hand and DVS. A noise event rate of nearly 8 keps was measured when the DVS was directed toward a static scene which is the baseline rate between gestures regardless of the distance. The peaks in the event rate show that the event rate above baseline is proportional to the distance between hand and DVS. However, small distance makes the hand gesture leave the DVS&#x00027; field of view while recordings with a distance of over 80 cm are almost indistinguishable from noise. Accordingly, the distance was kept from 40 to 50 cm.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Event density for various distances between the hand and the DVS.</p></caption>
<graphic xlink:href="fnins-13-00073-g0007.tif"/>
</fig>
<p>Sixteen gesture classes were defined with an additional class <italic>blank</italic>, as listed in <xref ref-type="table" rid="T1">Table 1</xref>. Neuro ConGD dataset comprises 2,040 instances of a set of 17 gestures recorded in random order. The Neuro ConGD dataset was split into 3 mutually exclusive subsets, namely the training, the validation and the testing set. The training set was performed by 4 subjects. The validation set was performed by 2 subjects. The testing set was also performed by 2 subjects. The gestures include beckoning, finger-snap, ok, push-hand (down, left, right, up), rotate-outward, swipe (left, right, up), tap-index, thumbs-up, zoom (in, out) (See <xref ref-type="fig" rid="F8">Figure 8</xref>).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Information of the Neuro ConGD dataset.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Set</bold></th>
<th valign="top" align="left"><bold>No. of labels</bold></th>
<th valign="top" align="left"><bold>No. of gestures</bold></th>
<th valign="top" align="left"><bold>No. of sequences</bold></th>
<th valign="top" align="left"><bold>No. of subjects</bold></th>
<th valign="top" align="left"><bold>Preprocessing provided</bold></th>
<th valign="top" align="left"><bold>Labels provided</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Training</td>
<td valign="top" align="left">17</td>
<td valign="top" align="left">1,360</td>
<td valign="top" align="left">80</td>
<td valign="top" align="left">4</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Yes</td>
</tr>
<tr>
<td valign="top" align="left">Validation</td>
<td valign="top" align="left">17</td>
<td valign="top" align="left">340</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">Yes</td>
</tr>
<tr>
<td valign="top" align="left">Testing</td>
<td valign="top" align="left">17</td>
<td valign="top" align="left">340</td>
<td valign="top" align="left">20</td>
<td valign="top" align="left">2</td>
<td valign="top" align="left">Yes</td>
<td valign="top" align="left">No</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>A dataset overview demonstrating the 16 gestures, each of which contains 4 sub-figures of different timestamps (best viewed in color).</p></caption>
<graphic xlink:href="fnins-13-00073-g0008.tif"/>
</fig>
<p>A purpose-built labeling software was developed, and each recording was manually annotated by labeling a list of start and end timestamps for each gesture with the name of gesture class.</p>
</sec>
<sec>
<title>3.2. Evaluation Metrics</title>
<sec>
<title>3.2.1. Mean Jaccard Index for Overall Recognition Performance</title>
<p>The Jaccard index measures the average relative overlap between the actual and the predicted sequences of timestamps for a given gesture (Pigou et al., <xref ref-type="bibr" rid="B37">2018</xref>; Wang et al., <xref ref-type="bibr" rid="B42">2018</xref>). For a sequence <italic>s</italic>, let <italic>G</italic><sub><italic>s, i</italic></sub> and <italic>P</italic><sub><italic>s, i</italic></sub> be binary indicator vectors for which 1-values correspond to timestamps in which the <italic>i</italic><sup><italic>th</italic></sup> gesture/action label is being performed. For the sequence <italic>s</italic>, the Jaccard Index for the <italic>i</italic><sup><italic>th</italic></sup> class is defined as:</p>
<disp-formula id="E15"><label>(15)</label><mml:math id="M24"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02229;</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>G</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0222A;</mml:mo><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>G</italic><sub><italic>s, i</italic></sub> denoted the ground truth of the <italic>i</italic><sup><italic>th</italic></sup> gesture label in sequence <italic>s</italic>, and <italic>P</italic><sub><italic>s, i</italic></sub> is the prediction for the <italic>i</italic><sup><italic>th</italic></sup> label in sequence <italic>s</italic>. Subsequently, for the sequence <italic>s</italic> with <italic>l</italic><sub><italic>s</italic></sub> true labels, the Jaccard Index <italic>J</italic><sub><italic>s</italic></sub> is calculated by:</p>
<disp-formula id="E16"><label>(16)</label><mml:math id="M25"><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>J</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>L</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mi>J</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>For all test sequences <italic>S</italic> &#x0003D; <italic>s</italic><sub>1</sub>, &#x02026;, <italic>s</italic><sub><italic>n</italic></sub> with 17 gestures, the mean Jaccard Index <inline-formula><mml:math id="M26"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> serves as the evaluation criteria, and it is calculated by:</p>
<disp-formula id="E17"><label>(17)</label><mml:math id="M27"><mml:mover accent='true'><mml:mrow><mml:msub><mml:mi>J</mml:mi><mml:mi>S</mml:mi></mml:msub></mml:mrow><mml:mo stretchy='true'>&#x000AF;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mfrac><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>n</mml:mi></mml:munderover><mml:mrow><mml:msub><mml:mi>J</mml:mi><mml:mrow><mml:msub><mml:mi>s</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:math></disp-formula>
</sec>
<sec>
<title>3.2.2. F-Score for Detection Performance</title>
<p>One difficulty of continuous gesture recognition is to detect the start time point and end time point of a gesture. For the segmented gesture recognition, the scenario of the problem can be summarized as classifying a well-delineated sequence of video frames as one of a set of gesture types. This contrasts with continuous human gesture recognition where there is no a priori given boundary of gesture execution. This requires the system to distinguish the blank and non-blank (gestures) class at each time point. To assess the detection performance, we keep the <italic>blank</italic> class and merge the remaining 16 gestures into one class, <italic>Ges</italic>. Then, the task now is to detect non-blank gestures without recognizing the specific kind of class. In the prediction and ground truth, the values of <italic>blank</italic> and <italic>Ges</italic> are 0 and 1, respectively. Subsequently, the <italic>F-score</italic> measure (Sokolova and Lapalme, <xref ref-type="bibr" rid="B40">2009</xref>) is defined as:</p>
<disp-formula id="E18"><label>(18)</label><mml:math id="M28"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mtext>&#x000A0;score&#x000A0;</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x0002A;</mml:mo><mml:mfrac><mml:mrow><mml:mtext>&#x000A0;Precision&#x000A0;</mml:mtext><mml:mo>&#x0002A;</mml:mo><mml:mtext>&#x000A0;Recall&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:mtext>&#x000A0;Precision&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:math></disp-formula>
</sec>
</sec>
<sec>
<title>3.3. Experimental Results</title>
<p>To illustrate the effectiveness of FLGR representation, a baseline where the RNN sequence classifier is trained with variable-length event sequences was designed. The proposed frameworks with the protocol of mean Jaccard Index <inline-formula><mml:math id="M29"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> and <italic>Fscore</italic> were assessed, and they were compared with the baseline.</p>
<p><xref ref-type="table" rid="T2">Table 2</xref> shows the final results across combinations of input representation and decoding method. An RNN baseline with inputs of event sequences was designed. The case of the baseline achieved 63.3% <inline-formula><mml:math id="M30"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> accuracy, which is reasonable and acceptable but still challenging. The case of the baseline verified the fundamental capability of our RNN network in event-driven recognition. Our architecture was improved based on FLGR representation and late HMM decoding. After FLGR representation learning, the <inline-formula><mml:math id="M31"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> accuracy was improved by more than 15%. The <italic>Fscore</italic> for the detection result was improved to 94.4%. The best result was achieved on FLGR representation learning with an RNN classifier and decoding method with an extra segmentation step. The averages of the best <inline-formula><mml:math id="M32"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> and <italic>Fscore</italic> were up to 86.9 and 96.6%, respectively.
For the cases among FLGR, the <inline-formula><mml:math id="M33"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> accuracy is also improved by more than 8% after applying HMM segmenter. <xref ref-type="table" rid="T2">Table 2</xref> shows the large improvement after applying FLGR representation, which verifies the enhanced efficiency of FLGR representation for training a sequence classifier.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Performance measured on the testing dataset with mean Jaccard Index <inline-formula><mml:math id="M34"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> and <italic>Fscore</italic>.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Methods</bold></th>
<th valign="top" align="center"><bold><inline-formula><mml:math id="M35"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula></bold></th>
<th valign="top" align="center"><bold><italic>Fscore</italic></bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Events&#x0002B;RNN (baseline)</td>
<td valign="top" align="center">0.633</td>
<td valign="top" align="center">0.873</td>
</tr>
<tr>
<td valign="top" align="left">FLGR&#x0002B;RNN</td>
<td valign="top" align="center">0.788</td>
<td valign="top" align="center">0.944</td>
</tr>
<tr>
<td valign="top" align="left">FLGR&#x0002B;RNN&#x0002B;Hmm</td>
<td valign="top" align="center">0.817</td>
<td valign="top" align="center">0.963</td>
</tr>
<tr>
<td valign="top" align="left"><bold>FLGR&#x0002B;RNN&#x0002B;HmmSeg</bold></td>
<td valign="top" align="center"><bold>0.869</bold></td>
<td valign="top" align="center"><bold>0.966</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p><italic><inline-formula><mml:math id="M36"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> was measured for overall performance of recognizing 17 gestures. Fscore was measured for detection performance. The higher the value is, the better the method will perform. Events, Preprocessed events; RNN, Recurrent neural network for sequence classification; Hmm, HMM decoder; HmmSeg, HMM segmenter. The bold values mean the best results achieved among the listed methods</italic>.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4">
<title>4. Conclusion and Discussion</title>
<p>In this study, a neuromorphic continuous gesture recognition system was proposed, and how it can benefit from FLGR representation learning and RNN-HMM hybrid was analyzed. A novel representation learning method was presented to learn non-accumulated-frame-based FLGR representation from DVS events streams. An RNN-HMM hybrid was proposed for the event-based sequence classification. A new labeled neuromorphic continuous gesture dataset Neuro ConGD was created with more than 2,040 instances of 17 gesture classes from 120 events sequences. An RNN classifier was developed as baseline, and the architecture with another 3 different paths on our dataset was improved. According to the experimental results, we could achieve a <inline-formula><mml:math id="M37"><mml:mover accent="false" class="mml-overline"><mml:mrow><mml:msub><mml:mrow><mml:mi>J</mml:mi></mml:mrow><mml:mrow><mml:mi>S</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo accent="true">&#x000AF;</mml:mo></mml:mover></mml:math></inline-formula> of 86.9% for recognition performance and an average <italic>Fscore</italic> of 96.6% for detection performance, with a mixture density autoencoder for FLGR representation learning, an RNN for sequence classification and an HMM segmentation process.</p>
<p>Compared with the conventional accumulated-frame-based representation of DVS events streams, FLGR marks two major contributions: First, FLGR is a sequence learned from a mixture density autoencoder and preserves the nature of event-based data better. Second, FLGR has a data format of fixed length, and it is easy to feed into a sequence classifier. With a preliminary result in this work, we believe that our FLGR representation learning and RNN-HMM hybrid has large potential to be transferred to neuromorphic vision in other pattern recognition and sequence classification tasks. We hope to inspire the research on the event-based sequence classification tasks with the non-accumulated-frame-based representation.</p>
<p>There are still several ways the recognition performance of this system can be improved. One idea would be to increase the information content of the learned representations at times of low event density. Currently, the autoencoder&#x00027;s state is reset to zero between time windows. This can be improved by using the autoencoder in a rolling fashion, i.e., by not resetting the hidden states between time windows. This could help to classify stretches of time in gestures of low activity, e.g., the turning point of a swiping gesture. Another idea would be to use a bidirectional neural network so that the subsequent fully-connected layers can take past as well as future context into account and avoid the phase of confusion at the beginning of a gesture. A third idea would be to use a model that can incorporate requirements, such as a minimum length of a hidden state, directly into the model instead of having to post-process the decoding and segmentations.</p>
</sec>
<sec id="s5">
<title>Author Contributions</title>
<p>GC, JNC, and ML contributed equally to this work. GC, JNC, ML, JC, FR, and AK did conception and design, analysis and interpretation of data, drafting and revising the article. ML and GC performed the experiments and acquired the data. JC provided the Dynamic Vision Sensor.</p>
<sec>
<title>Conflict of Interest Statement</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
</sec>
</body>
<back>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Abdel-Hamid</surname> <given-names>O.</given-names></name> <name><surname>Mohamed</surname> <given-names>A. R.</given-names></name> <name><surname>Jiang</surname> <given-names>H.</given-names></name> <name><surname>Penn</surname> <given-names>G.</given-names></name></person-group> (<year>2012</year>). <article-title>Applying convolutional neural networks concepts to hybrid nn-HMM model for speech recognition</article-title>, in <source>2012 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source> (<publisher-loc>Kyoto</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4277</fpage>&#x02013;<lpage>4280</lpage>.</citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aggarwal</surname> <given-names>J. K.</given-names></name> <name><surname>Ryoo</surname> <given-names>M. S.</given-names></name></person-group> (<year>2011</year>). <article-title>Human activity analysis: a review</article-title>. <source>ACM Comput. Surv.</source> <volume>43</volume>:<fpage>16</fpage>. <pub-id pub-id-type="doi">10.1145/1922649.1922653</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ahn</surname> <given-names>E. Y.</given-names></name> <name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Mullen</surname> <given-names>T.</given-names></name> <name><surname>Yen</surname> <given-names>J.</given-names></name></person-group> (<year>2011</year>). <article-title>Dynamic vision sensor camera based bare hand gesture recognition</article-title>, in <source>2011 IEEE Symposium on Computational Intelligence for Multimedia, Signal and Vision Processing (CIMSIVP)</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>52</fpage>&#x02013;<lpage>59</lpage>.</citation></ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Amir</surname> <given-names>A.</given-names></name> <name><surname>Taba</surname> <given-names>B.</given-names></name> <name><surname>Berg</surname> <given-names>D.</given-names></name></person-group> (<year>2017</year>). <article-title>A low power, fully event-based gesture recognition system</article-title>, in <source>IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Honolulu, HI</publisher-loc>).</citation></ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Bardow</surname> <given-names>P.</given-names></name> <name><surname>Davison</surname> <given-names>A. J.</given-names></name> <name><surname>Leutenegger</surname> <given-names>S.</given-names></name></person-group> (<year>2016</year>). <article-title>Simultaneous optical flow and intensity estimation from an event camera</article-title>, in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>), <fpage>884</fpage>&#x02013;<lpage>892</lpage>.</citation></ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cadena</surname> <given-names>C.</given-names></name> <name><surname>Carlone</surname> <given-names>L.</given-names></name> <name><surname>Carrillo</surname> <given-names>H.</given-names></name> <name><surname>Latif</surname> <given-names>Y.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name> <name><surname>Neira</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Past, present, and future of simultaneous localization and mapping: toward the robust-perception age</article-title>. <source>IEEE Trans. Robot.</source> <volume>32</volume>, <fpage>1309</fpage>&#x02013;<lpage>1332</lpage>. <pub-id pub-id-type="doi">10.1109/TRO.2016.2624754</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Chai</surname> <given-names>X.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Yin</surname> <given-names>F.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Chen</surname> <given-names>X.</given-names></name></person-group> (<year>2016</year>). <article-title>Two streams recurrent neural networks for large-scale continuous gesture recognition</article-title>, in <source>2016 23rd International Conference on Pattern Recognition (ICPR)</source> (<publisher-loc>Cancun</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>31</fpage>&#x02013;<lpage>36</lpage>.</citation></ref>
<ref id="B8">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cho</surname> <given-names>K.</given-names></name> <name><surname>van Merri&#x000EB;nboer</surname> <given-names>B.</given-names></name> <name><surname>G&#x000FC;l&#x000E7;ehre</surname> <given-names>&#x000C7;.</given-names></name> <name><surname>Bahdanau</surname> <given-names>D.</given-names></name> <name><surname>Bougares</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Learning phrase representations using rnn encoder&#x02013;decoder for statistical machine translation</article-title>. in <source>Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP)</source> (<publisher-loc>Doha</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>1724</fpage>&#x02013;<lpage>1734</lpage>.</citation></ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Cui</surname> <given-names>R.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Zhang</surname> <given-names>C.</given-names></name></person-group> (<year>2017</year>). <article-title>Recurrent convolutional neural networks for continuous sign language recognition by staged optimization</article-title>, in <source>IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Honolulu, HI</publisher-loc>).</citation></ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delbruck</surname> <given-names>T.</given-names></name> <name><surname>Lang</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>Robotic goalie with 3 ms reaction time at 4% cpu load using event-based dynamic vision sensor</article-title>. <source>Front. Neurosci.</source> <volume>7</volume>:<fpage>223</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2013.00223</pub-id><pub-id pub-id-type="pmid">24311999</pub-id></citation></ref>
<ref id="B11">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gaikwad</surname> <given-names>K.</given-names></name></person-group> (<year>2012</year>). <article-title>HMM classifier for human activity recognition</article-title>. <source>Comput. Sci. Eng.</source> <volume>2</volume>:<fpage>27</fpage>. <pub-id pub-id-type="doi">10.5121/cseij.2012.2403</pub-id></citation></ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gerstner</surname> <given-names>W.</given-names></name> <name><surname>Kistler</surname> <given-names>W. M.</given-names></name></person-group> (<year>2002</year>). <article-title>Spiking neuron models: single neurons, populations, plasticity</article-title>. <source>Encyclopedia Neurosci.</source> <volume>4</volume>, <fpage>277</fpage>&#x02013;<lpage>280</lpage>. <pub-id pub-id-type="doi">10.1017/CBO9780511815706</pub-id></citation></ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Graves</surname> <given-names>A.</given-names></name></person-group> (<year>2013</year>). <article-title>Generating sequences with recurrent neural networks</article-title>. <source>arXiv preprint arXiv:1308.0850</source>.</citation></ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hochreiter</surname> <given-names>S.</given-names></name> <name><surname>Schmidhuber</surname> <given-names>J.</given-names></name></person-group> (<year>1997</year>). <article-title>Long short-term memory</article-title>. <source>Neural Comput.</source> <volume>9</volume>, <fpage>1735</fpage>&#x02013;<lpage>1780</lpage>. <pub-id pub-id-type="pmid">9377276</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name></person-group> (<year>2016</year>). <article-title>DVS benchmark datasets for object tracking, action recognition, and object recognition</article-title>. <source>Front. Neurosci.</source> <volume>10</volume>:<fpage>405</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2016.00405</pub-id><pub-id pub-id-type="pmid">27630540</pub-id></citation></ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ji</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>W.</given-names></name> <name><surname>Yang</surname> <given-names>M.</given-names></name></person-group> (<year>2013</year>). <article-title>3D convolutional neural networks for human action recognition</article-title>. <source>IEEE Trans. Pattern Anal. Machine Intell.</source> <volume>35</volume>, <fpage>221</fpage>&#x02013;<lpage>231</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2012.59</pub-id><pub-id pub-id-type="pmid">22392705</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Jozefowicz</surname> <given-names>R.</given-names></name> <name><surname>Zaremba</surname> <given-names>W.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name></person-group> (<year>2015</year>). <article-title>An empirical exploration of recurrent network architectures</article-title>, in <source>International Conference on Machine Learning</source> (<publisher-loc>Lille</publisher-loc>), <fpage>2342</fpage>&#x02013;<lpage>2350</lpage>.</citation></ref>
<ref id="B18">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Adam: a method for stochastic optimization</article-title>, in <source>International Conference on Learning Representations (ICLR)</source> (<publisher-loc>San Diego, CA</publisher-loc>).</citation></ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Knoller</surname> <given-names>N.</given-names></name> <name><surname>Wolf</surname> <given-names>C. W.</given-names></name> <name><surname>Taylor</surname> <given-names>G.</given-names></name> <name><surname>Nebout</surname> <given-names>F.</given-names></name></person-group> (<year>2016</year>). <article-title>Deep hand: how to train a cnn on 1 million hand images when your data is continuous and weakly labelled</article-title>, in <source>IEEE Computer Society Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>), <fpage>3793</fpage>&#x02013;<lpage>3802</lpage>.</citation></ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Krizhevsky</surname> <given-names>A.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name> <name><surname>Hinton</surname> <given-names>G. E.</given-names></name></person-group> (<year>2012</year>). <article-title>Imagenet classification with deep convolutional neural networks</article-title>, in <source>Advances in Neural Information Processing Systems, NIPS 2012</source>, ed <person-group person-group-type="editor"><name><surname>Bartlett</surname> <given-names>P</given-names></name></person-group>. (<publisher-loc>Lake Tahoe, NV</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>) <fpage>1097</fpage>&#x02013;<lpage>1105</lpage>.</citation></ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>J.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name> <name><surname>Park</surname> <given-names>P. K.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Shin</surname> <given-names>C.-W.</given-names></name> <name><surname>Ryu</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2012a</year>). <article-title>Live demonstration: Gesture-based remote control using stereo pair of dynamic vision sensors</article-title>, in <source>2012 IEEE International Symposium on Circuits and Systems (ISCAS)</source> (<publisher-loc>Seoul</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>741</fpage>&#x02013;<lpage>745</lpage>.</citation></ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Park</surname> <given-names>P. K.</given-names></name> <name><surname>Shin</surname> <given-names>C. W.</given-names></name> <etal/></person-group>. (<year>2014</year>). <article-title>Real-time gesture interface based on event-driven processing from stereo silicon retinas</article-title>. <source>IEEE Trans. Neural Netw. Learn. Syst.</source> <volume>25</volume>, <fpage>2250</fpage>&#x02013;<lpage>2263</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2014.2308551</pub-id><pub-id pub-id-type="pmid">25420246</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lee</surname> <given-names>J. H.</given-names></name> <name><surname>Park</surname> <given-names>P. K.</given-names></name> <name><surname>Shin</surname> <given-names>C.-W.</given-names></name> <name><surname>Ryu</surname> <given-names>H.</given-names></name> <name><surname>Kang</surname> <given-names>B. C.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name></person-group> (<year>2012b</year>). <article-title>Touchless hand gesture ui with instantaneous responses</article-title>, in <source>2012 19th IEEE International Conference on Image Processing (ICIP)</source> (<publisher-loc>Orlando, FL</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1957</fpage>&#x02013;<lpage>1960</lpage>.</citation></ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lichtsteiner</surname> <given-names>P.</given-names></name> <name><surname>Posch</surname> <given-names>C.</given-names></name> <name><surname>Delbruck</surname> <given-names>T.</given-names></name></person-group> (<year>2008</year>). <article-title>A 128 &#x000D7; 128 120 db 15&#x003BC;s latency asynchronous temporal contrast vision sensor</article-title>. <source>IEEE J. Solid State Circ.</source> <volume>43</volume>, <fpage>566</fpage>&#x02013;<lpage>576</lpage>. <pub-id pub-id-type="doi">10.1109/JSSC.2007.914337</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Maqueda</surname> <given-names>A. I.</given-names></name> <name><surname>Loquercio</surname> <given-names>A.</given-names></name> <name><surname>Gallego</surname> <given-names>G.</given-names></name> <name><surname>Garc&#x00131;a</surname> <given-names>N.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name></person-group> (<year>2018</year>). <article-title>Event-based vision meets deep learning on steering prediction for self-driving cars</article-title>, in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Salt Lake City, UT</publisher-loc>), <fpage>5419</fpage>&#x02013;<lpage>5427</lpage>.</citation></ref>
<ref id="B26">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Moeys</surname> <given-names>D. P.</given-names></name> <name><surname>Corradi</surname> <given-names>F.</given-names></name> <name><surname>Kerr</surname> <given-names>E.</given-names></name> <name><surname>Vance</surname> <given-names>P.</given-names></name> <name><surname>Das</surname> <given-names>G.</given-names></name> <name><surname>Neil</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Steering a predator robot using a mixed frame/event-driven convolutional neural network</article-title>, in <source>2016 Second International Conference on Event-Based Control, Communication, and Signal Processing (EBCCSP)</source> (<publisher-loc>Krakow</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>8</lpage>.</citation></ref>
<ref id="B27">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Molchanov</surname> <given-names>P.</given-names></name> <name><surname>Gupta</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>K.</given-names></name> <name><surname>Kautz</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Hand gesture recognition with 3d convolutional neural networks</article-title>, in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition Workshops</source> (<publisher-loc>Boston, MA</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>.</citation></ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mueggler</surname> <given-names>E.</given-names></name> <name><surname>Forster</surname> <given-names>C.</given-names></name> <name><surname>Baumli</surname> <given-names>N.</given-names></name> <name><surname>Gallego</surname> <given-names>G.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name></person-group> (<year>2015</year>). <article-title>Lifetime estimation of events from dynamic vision sensors</article-title>, in <source>2015 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4874</fpage>&#x02013;<lpage>4881</lpage>.</citation></ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mueggler</surname> <given-names>E.</given-names></name> <name><surname>Huber</surname> <given-names>B.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name></person-group> (<year>2014</year>). <article-title>Event-based, 6-dof pose tracking for high-speed maneuvers</article-title>, in <source>2014 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2014)</source> (<publisher-loc>Chicago, IL</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2761</fpage>&#x02013;<lpage>2768</lpage>.</citation></ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Munda</surname> <given-names>G.</given-names></name> <name><surname>Reinbacher</surname> <given-names>C.</given-names></name> <name><surname>Pock</surname> <given-names>T.</given-names></name></person-group> (<year>2018</year>). <article-title>Real-time intensity-image reconstruction for event cameras using manifold regularisation</article-title>. <source>Int. J. Comput. Vis</source>. <volume>126</volume>, <fpage>1381</fpage>&#x02013;<lpage>1393</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-018-1106-2</pub-id></citation></ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Murphy</surname> <given-names>K. P.</given-names></name></person-group> (<year>2012</year>). <source>Machine Learning: A Probabilistic Perspective</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</citation></ref>
<ref id="B32">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Neil</surname> <given-names>D.</given-names></name> <name><surname>Pfeiffer</surname> <given-names>M.</given-names></name> <name><surname>Liu</surname> <given-names>S. C.</given-names></name></person-group> (<year>2016</year>). <article-title>Phased LSTM: accelerating recurrent network training for long or event-based sequences</article-title>, in <source>Advances in Neural Information Processing Systems, NIPS 2016</source>, ed <person-group person-group-type="editor"><name><surname>Lee</surname> <given-names>D. D</given-names></name></person-group>. (<publisher-loc>Barcelona</publisher-loc>: <publisher-name>Curran Associates, Inc.)</publisher-name>, <fpage>3882</fpage>&#x02013;<lpage>3890</lpage>.</citation></ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Neverova</surname> <given-names>N.</given-names></name> <name><surname>Wolf</surname> <given-names>C.</given-names></name> <name><surname>Taylor</surname> <given-names>G.</given-names></name> <name><surname>Nebout</surname> <given-names>F.</given-names></name></person-group> (<year>2016</year>). <article-title>Moddrop: adaptive multi-modal gesture recognition</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>38</volume>, <fpage>1692</fpage>&#x02013;<lpage>1706</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2015.2461544</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Neverova</surname> <given-names>N.</given-names></name> <name><surname>Wolf</surname> <given-names>C.</given-names></name> <name><surname>Taylor</surname> <given-names>G. W.</given-names></name> <name><surname>Nebout</surname> <given-names>F.</given-names></name></person-group> (<year>2014</year>). <article-title>Multi-scale deep learning for gesture detection and localization</article-title>, in <source>Workshop at the European Conference on Computer Vision</source> (<publisher-loc>Zurich</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>474</fpage>&#x02013;<lpage>490</lpage>.</citation></ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ohn-Bar</surname> <given-names>E.</given-names></name> <name><surname>Trivedi</surname> <given-names>M. M.</given-names></name></person-group> (<year>2014</year>). <article-title>Hand gesture recognition in real time for automotive interfaces: a multimodal vision-based approach and evaluations</article-title>. <source>IEEE Trans. Intell. Transport. Syst.</source> <volume>15</volume>, <fpage>2368</fpage>&#x02013;<lpage>2377</lpage>. <pub-id pub-id-type="doi">10.1109/TITS.2014.2337331</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>P. K.</given-names></name> <name><surname>Cho</surname> <given-names>B. H.</given-names></name> <name><surname>Park</surname> <given-names>J. M.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Kim</surname> <given-names>H. Y.</given-names></name> <name><surname>Kang</surname> <given-names>H. A.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Performance improvement of deep learning based gesture recognition using spatiotemporal demosaicing technique</article-title>, in <source>2016 IEEE International Conference on Image Processing (ICIP)</source> (<publisher-loc>Phoenix, AZ</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1624</fpage>&#x02013;<lpage>1628</lpage>.</citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pigou</surname> <given-names>L.</given-names></name> <name><surname>Van Den Oord</surname> <given-names>A.</given-names></name> <name><surname>Dieleman</surname> <given-names>S.</given-names></name> <name><surname>Van Herreweghe</surname> <given-names>M.</given-names></name> <name><surname>Dambre</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>Beyond temporal pooling: recurrence and temporal convolutions for gesture recognition in video</article-title>. <source>Int. J. Comput. Vis.</source> <volume>126</volume>, <fpage>430</fpage>&#x02013;<lpage>439</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-016-0957-7</pub-id></citation></ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ruffieux</surname> <given-names>S.</given-names></name> <name><surname>Lalanne</surname> <given-names>D.</given-names></name> <name><surname>Mugellini</surname> <given-names>E.</given-names></name> <name><surname>Khaled</surname> <given-names>O. A.</given-names></name></person-group> (<year>2014</year>). <article-title>A survey of datasets for human gesture recognition</article-title>, in <source>International Conference on Human-Computer Interaction</source> (<publisher-loc>Heraklion</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>337</fpage>&#x02013;<lpage>348</lpage>.</citation></ref>
<ref id="B39">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sinha</surname> <given-names>A.</given-names></name> <name><surname>Choi</surname> <given-names>C.</given-names></name> <name><surname>Ramani</surname> <given-names>K.</given-names></name></person-group> (<year>2016</year>). <article-title>Deephand: Robust hand pose estimation by completing a matrix imputed with deep features</article-title>, in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>), <fpage>4150</fpage>&#x02013;<lpage>4158</lpage>.</citation></ref>
<ref id="B40">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sokolova</surname> <given-names>M.</given-names></name> <name><surname>Lapalme</surname> <given-names>G.</given-names></name></person-group> (<year>2009</year>). <article-title>A systematic analysis of performance measures for classification tasks</article-title>. <source>Inform. Process. Manage.</source> <volume>45</volume>, <fpage>427</fpage>&#x02013;<lpage>437</lpage>. <pub-id pub-id-type="doi">10.1016/j.ipm.2009.03.002</pub-id></citation></ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vidal</surname> <given-names>A. R.</given-names></name> <name><surname>Rebecq</surname> <given-names>H.</given-names></name> <name><surname>Horstschaefer</surname> <given-names>T.</given-names></name> <name><surname>Scaramuzza</surname> <given-names>D.</given-names></name></person-group> (<year>2018</year>). <article-title>Ultimate slam? combining events, images, and imu for robust visual slam in hdr and high-speed scenarios</article-title>. <source>IEEE Robot. Automat. Lett.</source> <volume>3</volume>, <fpage>994</fpage>&#x02013;<lpage>1001</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2018.2793357</pub-id></citation></ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>W.</given-names></name> <name><surname>Ogunbona</surname> <given-names>P.</given-names></name> <name><surname>Wan</surname> <given-names>J.</given-names></name> <name><surname>Escalera</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>RGB-D-based human motion recognition with deep learning: a survey</article-title>. <source>Comput. Vis. Image Understand</source>. <volume>171</volume>, <fpage>118</fpage>&#x02013;<lpage>139</lpage>. <pub-id pub-id-type="doi">10.1016/j.cviu.2018.04.007</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>D.</given-names></name> <name><surname>Pigou</surname> <given-names>L.</given-names></name> <name><surname>Kindermans</surname> <given-names>Pieter-Jan</given-names></name></person-group> (<year>2016</year>). <article-title>Deep dynamic neural networks for multimodal gesture segmentation and recognition</article-title>. <source>IEEE Trans. Pattern Anal. Machine Intell.</source> <volume>38</volume>, <fpage>1583</fpage>&#x02013;<lpage>1597</lpage>. <pub-id pub-id-type="pmid">26955020</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Zhu</surname> <given-names>J.</given-names></name> <name><surname>Shi</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <article-title>Spatio-temporal backpropagation for training high-performance spiking neural networks</article-title>. <source>Front. Neurosci.</source> <volume>12</volume>:<fpage>331</fpage>. <pub-id pub-id-type="doi">10.3389/fnins.2018.00331</pub-id><pub-id pub-id-type="pmid">29875621</pub-id></citation></ref>
</ref-list>
<fn-group>
<fn fn-type="financial-disclosure"><p><bold>Funding.</bold> This work was supported by the German Research Foundation (DFG) and the Technical University of Munich (TUM) in the framework of the Open Access Publishing Program. The research leading to these results has partially received funding from the Shanghai Automotive Industry Sci-Tech Development Program under Grant Agreement No. 1838, from the European Union&#x00027;s Horizon 2020 Research and Innovation Program under Grant Agreement No. 785907 (HBP SGA2), and from the Fundamental Research Funds for the Central Universities of Tongji.</p>
</fn>
</fn-group>
</back>
</article>