<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2024.1363978</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Stable tensor neural networks for efficient deep learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Newman</surname> <given-names>Elizabeth</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2400017/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Horesh</surname> <given-names>Lior</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2618191/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Avron</surname> <given-names>Haim</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Kilmer</surname> <given-names>Misha E.</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Mathematics, Emory University</institution>, <addr-line>Atlanta, GA</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>Mathematics and Theoretical Computer Science, IBM TJ Watson Research Center</institution>, <addr-line>Yorktown, NY</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Applied Mathematics, Tel Aviv University</institution>, <addr-line>Tel Aviv-Yafo</addr-line>, <country>Israel</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Mathematics, Tufts University</institution>, <addr-line>Medford, MA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Yanqing Zhang, Yunnan University, China</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Yubai Yuan, The Pennsylvania State University (PSU), United States</p>
<p>Yuanyuan Ju, Kunming University of Science and Technology, China</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Elizabeth Newman <email>elizabeth.newman&#x00040;emory.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>05</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1363978</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>12</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Newman, Horesh, Avron and Kilmer.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Newman, Horesh, Avron and Kilmer</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Learning from complex, multidimensional data has become central to computational mathematics, and among the most successful high-dimensional function approximators are deep neural networks (DNNs). Training DNNs is posed as an optimization problem to learn network weights or parameters that well-approximate a mapping from input to target data. Multiway data or tensors arise naturally in myriad ways in deep learning, in particular as input data and as high-dimensional weights and features extracted by the network, with the latter often being a bottleneck in terms of speed and memory. In this work, we leverage tensor representations and processing to efficiently parameterize DNNs when learning from high-dimensional data. We propose tensor neural networks (t-NNs), a natural extension of traditional fully-connected networks, that can be trained efficiently in a reduced, yet more powerful parameter space. Our t-NNs are built upon matrix-mimetic tensor-tensor products, which retain algebraic properties of matrix multiplication while capturing high-dimensional correlations. Mimeticity enables t-NNs to inherit desirable properties of modern DNN architectures. We exemplify this by extending recent work on stable neural networks, which interpret DNNs as discretizations of differential equations, to our multidimensional framework. We provide empirical evidence of the parametric advantages of t-NNs on dimensionality reduction using autoencoders and classification using fully-connected and stable variants on benchmark imaging datasets MNIST and CIFAR-10.</p></abstract>
<kwd-group>
<kwd>tensor algebra</kwd>
<kwd>deep learning</kwd>
<kwd>machine learning</kwd>
<kwd>image classification</kwd>
<kwd>inverse problems</kwd>
</kwd-group>
<counts>
<fig-count count="13"/>
<table-count count="4"/>
<equation-count count="37"/>
<ref-count count="55"/>
<page-count count="17"/>
<word-count count="11232"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Data Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>With the explosion of computing resources, including cloud-based storage and accessible advanced hardware, learning from large-scale, multiway data has become possible. Two distinct fields have emerged as the gold standards for handling multidimensional data: tensor analysis for featurization and compression and deep learning for high-dimensional function approximation. Both deep learning and tensor methods have achieved strong performance in image and video recognition (Vasilescu and Terzopoulos, <xref ref-type="bibr" rid="B53">2002</xref>; Krizhevsky et al., <xref ref-type="bibr" rid="B28">2012</xref>), medical imaging analysis (Omberg et al., <xref ref-type="bibr" rid="B42">2007</xref>; Ronneberger et al., <xref ref-type="bibr" rid="B46">2015</xref>), spatiotemporal weather analysis (Chattopadhyay et al., <xref ref-type="bibr" rid="B7">2020</xref>; Li et al., <xref ref-type="bibr" rid="B32">2020</xref>), and more. This work focuses on leveraging advantages of tensor methods to enhance deep learning design.</p>
<p>Fundamentally, deep learning approximates mappings from (high-dimensional) inputs (e.g., images) to targets (e.g., classes) using deep neural networks (DNNs), which are simply nonlinear, composite functions parameterized by learnable weights. Despite the success and flexibility of DNNs, the storage and computational costs to design and apply these models can be a significant impediment&#x02014;there can be millions of network weights and learning requires an immense amount of time and top-of-the-line computational hardware (e.g., GPU clusters).</p>
<p>These computational challenges become bottlenecks for the classic feed-forward neural network, which builds DNNs using dense linear operators (matrices). Such operations use network weights in a highly inefficient manner, and composing many of these dense matrices can require millions of weights, which is both computationally demanding and can lead to algorithmic problems, such as overfitting. To reduce these inefficiencies, we propose a new type of fully-connected layer that replaces dense linear operators with dense tensor operators. The proposed tensor operators can reduce the number of network weights by an order of magnitude, leverage the inherent multidimensionality of the input data, and offer the potential for distributed computation. Thus, we call our architecture tensor neural networks (t-NNs).</p>
<p>The foundation of t-NNs is the &#x022C6;<sub><italic>M</italic></sub>-product (pronounced &#x0201C;star-M&#x0201D;), a family of tensor-tensor products which induces an algebraic structure on a multidimensional space (Kernfeld et al., <xref ref-type="bibr" rid="B20">2015</xref>). The &#x022C6;<sub><italic>M</italic></sub>-framework provably encodes information more efficiently than traditional matrix algorithms (Kilmer et al., <xref ref-type="bibr" rid="B22">2021</xref>) and has had success in facial recognition (Hao et al., <xref ref-type="bibr" rid="B15">2013</xref>), tomographic image reconstructions (Soltani et al., <xref ref-type="bibr" rid="B50">2016</xref>; Newman and Kilmer, <xref ref-type="bibr" rid="B39">2020</xref>), video completion (Zhang et al., <xref ref-type="bibr" rid="B55">2014</xref>), image classification (Newman et al., <xref ref-type="bibr" rid="B38">2018</xref>), and solving tensor linear systems (Ma and Molitor, <xref ref-type="bibr" rid="B34">2022</xref>). We call the &#x022C6;<sub><italic>M</italic></sub>-product <italic>matrix-mimetic</italic>; that is, familiar notions such as the identity and transpose are well-defined for the multilinear operation. The advantages of processing data multidimensionally include better leveraging inherent multiway structure and reducing the number of learnable network weights by an order of magnitude. The matrix-mimeticity enables the proposed t-NNs to naturally extend familiar deep learning concepts, such as backward propagation and loss functions, and non-trivial architectural designs to tensor space. We propose two additional extensions: tensor-based loss functions and a stable multidimensional framework, motivated by Haber and Ruthotto (<xref ref-type="bibr" rid="B13">2017</xref>), that brings topological advantages of featurization.</p>
<sec>
<title>1.1 Our contributions</title>
<p>Because of the popularity of this area of research, we want to clarify the objectives and contributions of this paper from the outset. Our contributions are the following:</p>
<list list-type="bullet">
<list-item><p>Tensor algebra and processing for efficient parameterization: we introduce a basic framework for t-NNs, describe the associated tensor algebra, and demonstrate the properties inherited from stable network architectures. We also derive the training algorithm for t-NNs, leveraging matrix-mimeticity for elegant formulations. We show that this tensor parameterization, compared to an equivalent matrix approach, can reduce the number of weights by an order of magnitude.</p></list-item>
<list-item><p>Tubal loss functions: the algebraic structure imposed by the &#x022C6;<sub><italic>M</italic></sub>-product is applied end-to-end. This includes defining new loss functions based on the outputs of the t-NN, which are no longer scalars, but the high-dimensional analog called <italic>tubes</italic>. This requires a new definition of <italic>tubal functions</italic>, and opens the door to a wide range of new evaluation metrics. These metrics offer more rigorous requirements to fit the training data, and hence can yield networks that generalize better.</p></list-item>
<list-item><p>Stable t-NNs: we demonstrate how matrix-mimeticity preserves desirable network architecture properties, specifically stability. This will enable the development of deeper, more expressive t-NNs.</p></list-item>
<list-item><p>Open-source code: for transparency and to expand the use of t-NNs, we provide open-source code at <ext-link ext-link-type="uri" xlink:href="https://github.com/elizabethnewman/tnn">https://github.com/elizabethnewman/tnn</ext-link>.</p></list-item>
<list-item><p>Scope: our goal is to explore a new algebraic structure imposed on neural networks and its advantages over equivalent architectures. This paper serves as the introduction of t-NNs and, similar to the original neural networks, we consider fully-connected layers only. We acknowledge that to obtain state-of-the-art results, we would need tools like convolutional and subsampling layers and significant hyperparameter tuning; however, these are outside the scope of this paper. Convolutional layers apply multiple translation-invariant filters to extract local connections; our t-NNs examine the global structure of the data. Subsampling or pooling layers reduce the dimensionality of our data and hence provide multi-scale features; our t-NNs use no pooling in order to preserve the algebraic structure. We address extensions of t-NNs to convolutional and subsampling layers in the conclusions.</p></list-item>
</list>
</sec>
<sec>
<title>1.2 Organization</title>
<p>This paper is organized as follows. In Section 2, we give a brief outline of related work combining tensors and deep learning. In Section 3, we give the background notation on tensor-tensor products. In Section 4, we formally introduce tensor neural networks (t-NNs) and tubal loss functions. In Section 5, we extend t-NNs to stable architectures and outline a Hamiltonian-inspired architecture. In Section 6, we provide numerical support for using t-NNs over comparable traditional fully-connected neural networks. In Section 7, we discuss future work including implementations for higher-order data and new t-NN designs.</p></sec>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>The high dimensional nature of neural network weights has driven the need to reduce the number of weights through structure. Early studies, such as LeCun et al. (<xref ref-type="bibr" rid="B31">1989</xref>), demonstrated that neural networks could learn faster from less data and generalize better by removing redundant weights. Following this observation, several works showed that structured weights, such as convolutions (Krizhevsky et al., <xref ref-type="bibr" rid="B28">2012</xref>), low rank weight matrices (Denil et al., <xref ref-type="bibr" rid="B11">2013</xref>), and Kronecker-structured matrices (Jagtap et al., <xref ref-type="bibr" rid="B18">2022</xref>), could perform well with significantly fewer parameters.</p>
<p>Tensor methods for compressing high dimensional data and operators grew in popularity concurrently with the development of structured operators for neural networks. Many popular tensor frameworks are designed to featurize multiway arrays (Tucker, <xref ref-type="bibr" rid="B52">1966</xref>; Carroll and Chang, <xref ref-type="bibr" rid="B6">1970</xref>; Harshman, <xref ref-type="bibr" rid="B16">1970</xref>; de Lathauwer et al., <xref ref-type="bibr" rid="B10">2000</xref>; Kolda and Bader, <xref ref-type="bibr" rid="B25">2009</xref>) or to approximate a given high-dimensional operator (Oseledets, <xref ref-type="bibr" rid="B43">2011</xref>; Cichocki et al., <xref ref-type="bibr" rid="B9">2016</xref>). Because the weights and features of deep neural networks are notoriously high-dimensional, tensorized approaches have gained traction. In Novikov et al. (<xref ref-type="bibr" rid="B41">2015</xref>), the authors combine efficient tensor storage and processing schemes with DNN training, resulting in up to seven times fewer network weights. This work specifically used the tensor train style of weight storage, which is notable for compression of very high dimensional data, but does not have linear algebraic motivations in this context. Further studies followed, such as Chien and Bao (<xref ref-type="bibr" rid="B8">2018</xref>) that used multiway operations to extract features convolutionally. This work computes a Tucker factorization of convolutional features rather than treating tensors as operators. Similar layer contraction approaches, called tensor regression layers, have appeared in works such as Cao et al. (<xref ref-type="bibr" rid="B5">2017</xref>) and Kossaifi et al. (<xref ref-type="bibr" rid="B26">2020</xref>). These approaches utilize low-rank Tucker-based factorizations to successfully reduce the number of weights in a network without sacrificing performance. 
These are more similar in spirit to pooling layers of convolutional neural networks rather than operations that preserve multilinearity. Many more studies have connected tensors and neural networks, and we recommend the survey (Wang et al., <xref ref-type="bibr" rid="B54">2023</xref>) for a more complete history of the intersection of the two fields.</p>
<p>As we alluded to in the previous paragraph, in this work, we take a notably different perspective on tensors. We consider tensors as <italic>multiway operators</italic> and process our layers under this tensor operation. This provides a linear algebraic structure that enables us to extend desirable neural network structure to high dimensions with ease. Because of our strong algebraic foundation, we are able to express forward and backward propagation simply; in comparison, other tensor frameworks require heavy indexing notation. We share and achieve the same goal as other tensor approaches of reducing the number of network weights.</p></sec>
<sec id="s3">
<title>3 Background and preliminaries</title>
<p>To motivate our multidimensional neural network design, we start by introducing our notation and the tensor algebra in which we work. We use <sc>Matlab</sc> indexing notation throughout the paper, such as selecting the <italic>j</italic>-th column of a matrix via <bold>A</bold>(:, <italic>j</italic>) or <bold>A</bold><sub>:, <italic>j</italic></sub>.</p>
<sec>
<title>3.1 Tensor preliminaries</title>
<p>Let <inline-formula><mml:math id="M1"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> be a real-valued, third-order tensor. Fixing the third-dimension, <italic>frontal slices</italic> <inline-formula><mml:math id="M2"><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> are matrices for <italic>k</italic> &#x0003D; 1, &#x02026;, <italic>n</italic>. 
Fixing the second-dimension, <italic>lateral slices</italic> <inline-formula><mml:math id="M3"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are matrices oriented along the third dimension for <italic>j</italic> &#x0003D; 1, &#x02026;, <italic>m</italic><sub>2</sub>. Fixing the first and second dimensions, <italic>tubes</italic> <inline-formula><mml:math id="M4"><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>a</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> are vectors oriented along the third dimension for <italic>i</italic> &#x0003D; 1, &#x02026;, <italic>m</italic><sub>1</sub> and <italic>j</italic> &#x0003D; 1, &#x02026;, <italic>m</italic><sub>2</sub>. We depict these partitions in <xref ref-type="fig" rid="F1">Figure 1</xref>. While this paper focuses on real-valued, third-order tensors (three indexes), we note all of the presented concepts generalize to higher-order and complex-valued tensors.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Tensor notation. <bold>(A)</bold> <inline-formula><mml:math id="M5"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:math></inline-formula> is a third-order tensor, <bold>(B)</bold> <inline-formula><mml:math id="M6"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> are lateral slices, <bold>(C)</bold> <bold>A</bold><sup>(<italic>k</italic>)</sup> are frontal slices, and <bold>(D)</bold> <bold>a</bold><sub><italic>ij</italic></sub> are tubes.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0001.tif"/>
</fig>
<p>We interpret tensors as <italic>t-linear operators</italic> (Kilmer and Martin, <xref ref-type="bibr" rid="B23">2011</xref>; Kernfeld et al., <xref ref-type="bibr" rid="B20">2015</xref>). Through our operator lens, it is possible to define analogous matrix algebraic properties for tensors, such as orthogonality and rank. Thus, this framework has been described as <italic>matrix-mimetic</italic>. We describe the fundamental tools to understand how tensors operate for this paper, and refer the reader to Kilmer et al. (<xref ref-type="bibr" rid="B21">2013</xref>, <xref ref-type="bibr" rid="B22">2021</xref>) and Kernfeld et al. (<xref ref-type="bibr" rid="B20">2015</xref>) for details about the underlying algebra.</p>
<p>We define a product to apply matrices along the third dimension of a tensor (i.e., along the tubes).</p>
<p>Definition 3.1 (mode-3 product). Given <inline-formula><mml:math id="M7"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and <bold>M</bold> &#x02208; &#x0211D;<sup>&#x02113; &#x000D7; <italic>n</italic></sup>, the mode-3 product, denoted <inline-formula><mml:math id="M8"><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>&#x02261;</mml:mo><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:msub><mml:mrow><mml:mo>&#x000D7;</mml:mo></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:math></inline-formula>, outputs an <italic>m</italic><sub>1</sub>&#x000D7;<italic>m</italic><sub>2</sub>&#x000D7;&#x02113; tensor with entries</p>
<disp-formula id="E1"><mml:math id="M9"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>i</italic><sub>1</sub> &#x0003D; 1, &#x02026;, <italic>m</italic><sub>1</sub>, <italic>i</italic><sub>2</sub> &#x0003D; 1, &#x02026;, <italic>m</italic><sub>2</sub>, and <italic>k</italic> &#x0003D; 1, &#x02026;, &#x02113;.</p>
<p>The mode-3 product can be generalized along any mode; see Kolda and Bader (<xref ref-type="bibr" rid="B25">2009</xref>) for details.</p>
<p>Next, we define the facewise product to multiply the frontal slices of two third-order tensors in parallel.</p>
<p>Definition 3.2 (facewise product). Given <inline-formula><mml:math id="M10"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x02113;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula><mml:math id="M11"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, the facewise product, denoted <inline-formula><mml:math id="M12"><mml:mstyle mathvariant="bold-script"><mml:mi>C</mml:mi></mml:mstyle><mml:mo>&#x02261;</mml:mo><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x025B5;</mml:mo><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:math></inline-formula>, returns an <italic>m</italic><sub>1</sub>&#x000D7;<italic>m</italic><sub>2</sub>&#x000D7;<italic>n</italic> tensor where</p>
<disp-formula id="E2"><mml:math id="M13"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>C</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>B</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>k</italic> &#x0003D; 1, &#x02026;, <italic>n</italic>.</p>
<p>Combining Definition 3.1 and Definition 3.2, we define our tensor operation, the &#x022C6;<sub><italic>M</italic></sub>-product, as follows:</p>
<p>Definition 3.3 (&#x022C6;<sub><italic>M</italic></sub>-product). Given <inline-formula><mml:math id="M14"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>&#x02113;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula><mml:math id="M15"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02113;</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, and an invertible <italic>n</italic>&#x000D7;<italic>n</italic> matrix <bold>M</bold>, the &#x022C6;<sub><italic>M</italic></sub>-product outputs an <italic>m</italic><sub>1</sub>&#x000D7;<italic>m</italic><sub>2</sub>&#x000D7;<italic>n</italic> tensor of the following form:</p>
<disp-formula id="E3"><mml:math id="M16"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>&#x025B5;</mml:mo><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo>&#x000D7;</mml:mo></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mi>M</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M17"><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>&#x02261;</mml:mo><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle><mml:msub><mml:mrow><mml:mo>&#x000D7;</mml:mo></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:math></inline-formula>.</p>
<p>We say that <inline-formula><mml:math id="M18"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:math></inline-formula> and <inline-formula><mml:math id="M19"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:math></inline-formula> live in the spatial domain and <inline-formula><mml:math id="M20"><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> and <inline-formula><mml:math id="M21"><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> live in the transform domain. We perform the facewise product in the transform domain, then return to the spatial domain by applying <bold>M</bold><sup>&#x02212;1</sup> along the tubes. If <bold>M</bold> is the identity matrix, the &#x022C6;<sub><italic>M</italic></sub>-product is exactly the facewise product. If <bold>M</bold> is the discrete Fourier transform (DFT) matrix, we obtain the t-product (Kilmer and Martin, <xref ref-type="bibr" rid="B23">2011</xref>). In this case, the frontal slices of <inline-formula><mml:math id="M22"><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> correspond to different frequencies in the Fourier domain and are therefore decoupled.</p>
<p>We can interpret the &#x022C6;<sub><italic>M</italic></sub>-product as a block-structured matrix product via</p>
<disp-formula id="E4"><label>(1)</label><mml:math id="M23"><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mrow><mml:mo>&#x022C6;</mml:mo><mml:mtext>M</mml:mtext></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>B</mml:mi></mml:mstyle><mml:mo>&#x02261;</mml:mo><mml:munder><mml:munder><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>M</mml:mi></mml:mstyle><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>&#x02297;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>I</mml:mi></mml:mstyle><mml:mrow><mml:msub><mml:mi>m</mml:mi><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x0005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msup><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x0005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo 
stretchy='false'>(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mo>&#x022F1;</mml:mo></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msup><mml:mrow><mml:mover accent='true'><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x0005E;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>M</mml:mi></mml:mstyle><mml:mo>&#x02297;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>I</mml:mi></mml:mstyle><mml:mi>&#x02113;</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo stretchy='true'>&#x0FE38;</mml:mo></mml:munder><mml:mrow><mml:mtext>struct</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:munder><mml:munder><mml:munder><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' 
mathsize='normal'><mml:mi>B</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>B</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>2</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x022EE;</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>B</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy='true'>&#x0FE38;</mml:mo></mml:munder><mml:mrow><mml:mtext>unfold</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>B</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:munder></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>where &#x02297; is the Kronecker product (Petersen and Pedersen, <xref ref-type="bibr" rid="B45">2012</xref>). The block matrix structure, <inline-formula><mml:math id="M24"><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, depends on the choice of transformation, <bold>M</bold>. We consider two familiar examples: the facewise product (<bold>M</bold> &#x0003D; <bold>I</bold><sub><italic>n</italic></sub>) and the t-product (<bold>M</bold> &#x0003D; <bold>F</bold><sub><italic>n</italic></sub>, the DFT matrix):</p>
<disp-formula id="E5"><label>(2a)</label><mml:math id="M25"><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>M</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>I</mml:mi></mml:mstyle><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:mtd><mml:mtd columnalign='left'><mml:mrow><mml:mtext>struct</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mtext>bdiag</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>1</mml:mn><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mn>2</mml:mn><mml:mo 
stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mo>&#x022F1;</mml:mo></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>A</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mi>n</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:msup></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E6"><label>(2b)</label><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>F</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mtd><mml:mtd><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">bcirc</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable style="text-align:axis;" equalrows="false" columnlines="none none none none none none none none none" equalcolumns="false" class="array"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:mo>&#x022EF;</mml:mo></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:mo>&#x022EF;</mml:mo></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>3</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x022EE;</mml:mo></mml:mtd><mml:mtd><mml:mo>&#x022EE;</mml:mo></mml:mtd><mml:mtd><mml:mo>&#x022F1;</mml:mo></mml:mtd><mml:mtd><mml:mo>&#x022EE;</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:mo>&#x022EF;</mml:mo></mml:mtd><mml:mtd><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>While we never explicitly form <xref ref-type="disp-formula" rid="E5">Equation (2)</xref>, the block structure will be helpful for subsequent analysis.</p>
</sec>
<sec>
<title>3.2 Matrix-mimetic tensor algebra</title>
<p>The &#x022C6;<sub><italic>M</italic></sub>-product yields a well-defined algebraic structure. Specifically, suppose we have tubes <bold>a</bold>, <bold>b</bold>&#x02208;&#x0211D;<sup>1 &#x000D7; 1 &#x000D7; <italic>n</italic></sup>. Then,</p>
<p><graphic xlink:href="fdata-07-1363978-e0001.tif"/></p>
<p>where <inline-formula><mml:math id="M29"><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">vec</mml:mtext></mml:mstyle><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mi>&#x02192;</mml:mi><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> turns a tube into a column vector. The &#x022C6;<sub><italic>M</italic></sub>-product of tubes (3) is equivalent to post-multiplying by the structured matrix, <bold>R</bold>[<bold>b</bold>]. Note that <bold>R</bold>[&#x000B7;] implicitly depends on the choice of <bold>M</bold>, but we omit explicitly writing this dependence for notational simplicity. The tubes form a matrix subalgebra which dictates the algebraic structure imposed on the high-dimensional space (Kernfeld et al., <xref ref-type="bibr" rid="B20">2015</xref>). As a result, the &#x022C6;<sub><italic>M</italic></sub>-product is <italic>matrix-mimetic</italic> and yields several familiar concepts.</p>
<p>Definition 3.4 (&#x022C6;<sub><italic>M</italic></sub>-identity tube). The identity tube <bold>e</bold> &#x02208; &#x0211D;<sup>1 &#x000D7; 1 &#x000D7; <italic>n</italic></sup> under the &#x022C6;<sub><italic>M</italic></sub>-product is</p>
<disp-formula id="E8"><mml:math id="M30"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant='bold'><mml:mtext>e</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mn>1</mml:mn></mml:mstyle><mml:msub><mml:mrow><mml:mo>&#x000D7;</mml:mo></mml:mrow><mml:mrow><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <bold>1</bold> is the 1 &#x000D7; 1 &#x000D7; <italic>n</italic> tube containing all ones.</p>
<p>This gives rise to the notion of an identity tensor.</p>
<p>Definition 3.5 (&#x022C6;<sub><italic>M</italic></sub>-identity tensor). A tensor <inline-formula><mml:math id="M31"><mml:mstyle mathvariant="bold-script"><mml:mi>I</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is the identity tensor if <inline-formula><mml:math id="M32"><mml:mstyle mathvariant="bold-script"><mml:mi>I</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>e</mml:mtext></mml:mstyle></mml:math></inline-formula> for <italic>i</italic> &#x0003D; 1, &#x02026;, <italic>m</italic> where <bold>e</bold> is the &#x022C6;<sub><italic>M</italic></sub>-identity tube.</p>
<p>Note that if the size of the third dimension is equal to one (i.e., <italic>n</italic> &#x0003D; 1), then Definition 3.5 collapses into the identity matrix. This is a hallmark of our &#x022C6;<sub><italic>M</italic></sub>-framework and matrix-mimeticity; the product and definitions reduce to the equivalent matrix definitions when the third dimension is removed.</p>
<p>Definition 3.6 (&#x022C6;<sub><italic>M</italic></sub>-transpose). Given <inline-formula><mml:math id="M33"><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, its transpose <inline-formula><mml:math id="M34"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle><mml:mo>&#x02261;</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>A</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is</p>
<disp-formula id="E9"><mml:math id="M35"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>B</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>A</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>H</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>k</italic> &#x0003D; 1, &#x02026;, <italic>n</italic>.</p>
<p>Note that if our transformation <bold>M</bold> is complex-valued, the transpose operator in the transform domain performs the conjugate transpose. However, because we are working with real-valued tensors in the spatial domain, the transpose will be real-valued as well.</p></sec>
</sec>
<sec id="s4">
<title>4 Tensor neural networks (t-NNs)</title>
<p>In general, neural networks are parameterized mappings from an input space <inline-formula><mml:math id="M36"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> to the target space <inline-formula><mml:math id="M37"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>C</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula>. These mappings are composite functions of the form</p>
<disp-formula id="E10"><label>(4)</label><mml:math id="M38"><mml:mrow><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mtext>NN</mml:mtext></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mo>&#x000B7;</mml:mo><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x02261;</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mo>&#x022EF;</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mo>&#x000B7;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mn>2</mml:mn></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>&#x022EF;</mml:mo><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:msub><mml:mstyle mathvariant='bold-italic' mathsize='normal'><mml:mi>&#x003B8;</mml:mi></mml:mstyle><mml:mi>d</mml:mi></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>.</mml:mo></mml:mrow></mml:math></disp-formula>
<p>Each subfunction <italic>f</italic><sub><italic>j</italic></sub>(&#x000B7;, <italic><bold>&#x003B8;</bold></italic><sub><italic>j</italic></sub>) for <italic>j</italic> &#x0003D; 1, &#x02026;, <italic>d</italic> is called a <italic>layer</italic>. The goal is to find a good set of weights <italic><bold>&#x003B8;</bold></italic> &#x02261; (<italic><bold>&#x003B8;</bold></italic><sub>1</sub>, &#x02026;, <italic><bold>&#x003B8;</bold></italic><sub><italic>d</italic></sub>) &#x02208; &#x00398; such that <italic>F</italic><sub>NN</sub>(<bold>y</bold>, <italic><bold>&#x003B8;</bold></italic>) &#x02248; <italic>c</italic> for all input-target pairs <inline-formula><mml:math id="M39"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>D</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula>. Here, &#x00398; is the parameter space and <inline-formula><mml:math id="M40"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>D</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02282;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>C</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> is the data space.</p>
<p>The most common layer of feedforward neural networks (<xref ref-type="disp-formula" rid="E10">4</xref>) consists of an affine transformation and a pointwise nonlinearity of the form</p>
<disp-formula id="E11"><label>(5)</label><mml:math id="M41"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M42"><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> is a weight matrix, <inline-formula><mml:math id="M43"><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:math></inline-formula> is a bias vector, and <inline-formula><mml:math id="M44"><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mi>&#x0211D;</mml:mi><mml:mi>&#x02192;</mml:mi><mml:mi>&#x0211D;</mml:mi></mml:math></inline-formula> is a one-dimensional nonlinear activation function, applied entrywise. In practice, activation functions are monotonic, such as the sigmoid function, &#x003C3;(<italic>x</italic>) &#x0003D; 1/(1&#x0002B;<italic>e</italic><sup>&#x02212;<italic>x</italic></sup>), or Rectified Linear Unit (ReLU), &#x003C3;(<italic>x</italic>) &#x0003D; max(<italic>x</italic>, 0). 
We call <bold>y</bold><sub><italic>j</italic></sub> the <italic>features</italic> of layer <italic>j</italic> and <inline-formula><mml:math id="M45"><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> are the input features. Notationally, we use <italic><bold>&#x003B8;</bold></italic><sub><italic>j</italic></sub> &#x02261; (<bold>W</bold><sub><italic>j</italic></sub>, <bold>b</bold><sub><italic>j</italic></sub>) to collect all of the learnable weights for layer <italic>j</italic>.</p>
<sec>
<title>4.1 Improved parameterization with the &#x022C6;<sub><italic>M</italic></sub>-product</title>
<p>When designing a neural network, we seek to balance a simple parameter space with an expressive feature space. However, traditional fully-connected layers like (<xref ref-type="disp-formula" rid="E11">5</xref>) use parameters in a highly inefficient manner. We propose new tensor fully-connected layers for a more efficient parameterization while still creating a rich feature space. Specifically, we consider the following tensor forward propagation scheme:</p>
<disp-formula id="E12"><label>(6)</label><mml:math id="M46"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>j</italic> &#x0003D; 1, &#x02026;, <italic>d</italic>. Suppose our input features <inline-formula><mml:math id="M47"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> is of size <italic>m</italic><sub>0</sub>&#x000D7;1 &#x000D7; <italic>n</italic>. Here, the weight tensor <inline-formula><mml:math id="M48"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is of size <italic>m</italic><sub><italic>j</italic>&#x0002B;1</sub>&#x000D7;<italic>m</italic><sub><italic>j</italic></sub>&#x000D7;<italic>n</italic> and the bias <inline-formula><mml:math id="M49"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula> is of size <italic>m</italic><sub><italic>j</italic>&#x0002B;1</sub>&#x000D7;1 &#x000D7; <italic>n</italic>. 
The forward propagation through <xref ref-type="disp-formula" rid="E12">Equation (6)</xref> results in a tensor neural network <italic>F</italic><sub>tnn</sub>(&#x000B7;, <italic><bold>&#x003B8;</bold></italic>) where <inline-formula><mml:math id="M50"><mml:mstyle mathvariant='bold'><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle><mml:mo> &#x02261; </mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
<p>Through the illustration in <xref ref-type="fig" rid="F2">Figure 2</xref>, we depict the number of weight parameters required to preserve the size of our feature space using either a dense matrix (<xref ref-type="fig" rid="F2">Figure 2A</xref>) or a dense tensor under the &#x022C6;<sub><italic>M</italic></sub>-product (<xref ref-type="fig" rid="F2">Figure 2B</xref>). Using the &#x022C6;<sub><italic>M</italic></sub>-algebra, we can reduce the number of weight parameters by a factor of <italic>n</italic> while maintaining the same number of features (i.e., maintaining a rich feature space). Beyond the parametric advantages, the multilinear &#x022C6;<sub><italic>M</italic></sub>-product incorporates the structure of the data into the features. This enables our t-NNs to extract more meaningful features, and hence improve the richness of our feature space.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Comparison of network parameterizations to preserve the number of features of layer <italic>j</italic>&#x02212;1 and layer <italic>j</italic>. The matrix mapping requires <italic>n</italic><sup>4</sup> weights in <bold>W</bold><sub><italic>j</italic></sub> and tensor mapping requires <italic>n</italic><sup>3</sup> weights in <inline-formula><mml:math id="M51"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. <bold>(A)</bold> Matrix linear mapping. <bold>(B)</bold> Tensor &#x022C6;<sub><italic>M</italic></sub>-mapping.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0002.tif"/>
</fig>
</sec>
<sec>
<title>4.2 The training problem</title>
<p>Training a (tensor) neural network is posed as a stochastic optimization problem given by</p>
<disp-formula id="E13"><label>(7)</label><mml:math id="M52"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:mi>&#x00398;</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mi>&#x1D53C;</mml:mi><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">tnn</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo class="qopname">&#x02192;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003BB;</mml:mi><mml:mi>R</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M53"><mml:mi>L</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>C</mml:mi></mml:mstyle></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:mi>&#x0211D;</mml:mi></mml:math></inline-formula> is the loss function that measures the misfit between the network prediction and the true target. The expectation is taken over all input-target pairs <inline-formula><mml:math id="M54"><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>D</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula>. The additional function <inline-formula><mml:math id="M55"><mml:mi>R</mml:mi><mml:mo>:</mml:mo><mml:mi>&#x00398;</mml:mi><mml:mi>&#x02192;</mml:mi><mml:mi>&#x0211D;</mml:mi></mml:math></inline-formula> regularizes the weights to promote desirable properties (e.g., smoothness), weighted by a regularization parameter &#x003BB; &#x0003E; 0.</p>
</sec>
<sec>
<title>4.3 Tubal loss (t-loss) functions</title>
<p>The loss function is chosen based on the given task. For regression, we often use mean squared error, and for classification, which is the focus of this paper, we often use cross entropy. Cross entropy loss, related to the Kullback&#x02013;Leibler (KL) divergence (Kullback and Leibler, <xref ref-type="bibr" rid="B29">1951</xref>), measures the distance between two probability distributions. In practice, we first transform the network outputs into a set of probabilities using exponential normalization. Specifically, we use the softmax function <inline-formula><mml:math id="M56"><mml:mi>h</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msup><mml:mi>&#x02192;</mml:mi><mml:msup><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, defined entrywise as</p>
<disp-formula id="E14"><label>(8)</label><mml:math id="M57"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">for</mml:mtext><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>p</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x00394;<sup><italic>p</italic></sup> is the <italic>p</italic>-dimensional unit simplex.</p>
<p>To preserve the algebraic integrity of t-NNs, we introduce a tubal variant of the softmax function. Drawing inspiration from the tubal functions of Lund (<xref ref-type="bibr" rid="B33">2020</xref>), we start by defining tubal functions generally.</p>
<p>Definition 4.1 (tubal function). Given <bold>b</bold> &#x02208; &#x0211D;<sup>1 &#x000D7; 1 &#x000D7; <italic>n</italic></sup>, a <italic>tubal function</italic> <inline-formula><mml:math id="M58"><mml:mi>f</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mi>&#x02192;</mml:mi><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> acts on the action of <bold>b</bold> under the &#x022C6;<sub><italic>M</italic></sub>-product; that is,</p>
<disp-formula id="E15"><label>(9)</label><mml:math id="M59"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mtext>tubal function</mml:mtext></mml:mrow></mml:munder></mml:mstyle><mml:mo> &#x02261; </mml:mo><mml:mstyle displaystyle="true"><mml:munder><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>R</mml:mtext></mml:mstyle><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mtext>matrix function</mml:mtext></mml:mrow></mml:munder></mml:mstyle><mml:mo> &#x02261; </mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:mstyle displaystyle="true"><mml:munder><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle><mml:mtext class="textrm" mathvariant="normal">vec</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mtext>pointwise function</mml:mtext></mml:mrow></mml:munder></mml:mstyle><mml:msup><mml:mrow><mml:mstyle 
mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In practice, a tubal function is applied pointwise in the transform domain.</p>
<p>We note that the pointwise function in Definition 4.1 is equivalent to applying a matrix function to the eigenvalues of the matrix <bold>R</bold>[<bold>b</bold>]. We provide a visualization of the effects of tubal functions compared to applying an entry-wise function in Example 4.2.</p>
<p>Example 4.2 (Visualizations of tubal functions). Consider the following RGB image <inline-formula><mml:math id="M60"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>150</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>169</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> of a Tufts Community Appeal elephant (TCA, <xref ref-type="bibr" rid="B51">2023</xref>), where 3 is the number of color channels (<xref ref-type="fig" rid="F3">Figure 3A</xref>). We rescale each entry of <inline-formula><mml:math id="M61"><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:math></inline-formula> between 0 and 1 and consider the following transformation matrix:</p>
<disp-formula id="E16"><label>(10)</label><mml:math id="M62"><mml:mrow><mml:mtable columnalign='left'><mml:mtr columnalign='left'><mml:mtd columnalign='left'><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>M</mml:mi></mml:mstyle><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>&#x02003;</mml:mo><mml:mtext>and</mml:mtext><mml:mo>&#x02003;</mml:mo><mml:msup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>M</mml:mi></mml:mstyle><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd><mml:mtd><mml:mn>0</mml:mn></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd><mml:mtd><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mn>1</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>To illustrate the effect of using tubal functions, we compare applying various functions as pointwise functions (i.e., independent of <bold>M</bold>) and as tubal functions using <xref ref-type="disp-formula" rid="E15">Equation (9)</xref> in <xref ref-type="fig" rid="F3">Figure 3B</xref>.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Comparison of entry-wise and tubal functions applied to roughly piecewise constant image. <bold>(A)</bold> Image and color channels of elephant image. The elephant is almost piecewise constant with easily distinguishable RGB values. <bold>(B)</bold> Effects of tubal functions under the transformation (<xref ref-type="disp-formula" rid="E16">10</xref>). Images are rescaled between 0 and 1 after applying the respective function.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0003.tif"/>
</fig>
<p>Tubal functions are able to capture shared patterns among the tubes that entrywise operators ignore. Each choice of tubal function highlights different features of the image, such as the body with <italic>f</italic>(<italic>x</italic>) &#x0003D; max(<italic>x</italic>, 0) and the heart with <italic>f</italic>(<italic>x</italic>) &#x0003D; tanh(<italic>x</italic>). In comparison, the entrywise counterparts do not bring new insights or structure. A striking difference occurs for <italic>f</italic>(<italic>x</italic>) &#x0003D; sign(<italic>x</italic>). The entrywise operator turns all pixels white because the RGB entries are all nonnegative. In contrast, the tubal equivalent is applied in the transform domain, and hence reveals meaningful features of the image.</p>
<p>Note that the results depend on the chosen tubal function and transform <bold>M</bold>. We deliberately chose <bold>M</bold> to emphasize green and blue channels and diminish the effect of the red channel, enabling easy-to-interpret distinctions between tubal and entrywise functions.</p>
<p>We can now define a tubal softmax function based on the traditional softmax function in (<xref ref-type="disp-formula" rid="E14">8</xref>).</p>
<p>Definition 4.3 (tubal softmax). Consider a lateral slice <inline-formula><mml:math id="M63"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> as a <italic>p</italic>&#x000D7;1 vector of 1 &#x000D7; 1 &#x000D7; <italic>n</italic> tubes. Then, the tubal softmax function <inline-formula><mml:math id="M64"><mml:mi>h</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mi>&#x02192;</mml:mi><mml:msup><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> performs the following mapping:</p>
<disp-formula id="E17"><mml:math id="M65"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>i</italic> &#x0003D; 1, &#x02026;, <italic>p</italic> where <inline-formula><mml:math id="M66"><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> and the exponential functions are applied as tubal functions. Here, &#x00394;<sup><italic>p</italic>&#x000D7;1 &#x000D7; <italic>n</italic></sup> is the tubal-equivalent of <italic>p</italic>-dimensional unit simplex.</p>
<p>We interpret <inline-formula><mml:math id="M67"><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> as a vector of &#x0201C;tubal probabilities&#x0201D; in that the tubes sum to the identity tube</p>
<disp-formula id="E18"><mml:math id="M68"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mo 
class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x02003;</mml:mo><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle displaystyle="false"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mo class="qopname">exp</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>&#x02003;</mml:mo><mml:mo>=</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>e</mml:mtext></mml:mstyle><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Through Definition 4.3, we demonstrate the parallels between tubal functions and traditional functions; the similarities are a direct consequence of a matrix-mimetic tensor framework.</p>
<p>The last step is to define the tubal cross entropy function, which converts the output of the tubal softmax function to a scalar. Recall that traditional cross entropy <inline-formula><mml:math id="M69"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">ce</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x0002B;</mml:mo></mml:mrow></mml:msub></mml:math></inline-formula> for one sample is given by</p>
<disp-formula id="E19"><label>(11)</label><mml:math id="M70"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">ce</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mo class="qopname">log</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <bold>x</bold> is the output of the network, <italic>c</italic> is the corresponding target class, and <italic>h</italic> is the softmax function. We generalize (<xref ref-type="disp-formula" rid="E19">11</xref>) to a tubal variant as follows:</p>
<p>Definition 4.4 (tubal cross entropy (t-cross entropy)). The <italic>tubal cross entropy function</italic> <inline-formula><mml:math id="M71"><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">tce</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x00394;</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:msub><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mo>&#x0002B;</mml:mo></mml:mrow></mml:msub></mml:math></inline-formula> is given by</p>
<disp-formula id="E20"><mml:math id="M72"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">tce</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mo>||</mml:mo><mml:mtext class="textrm" mathvariant="normal">vec</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo class="qopname">log</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo class="qopname">&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>h</italic> is the tubal softmax function, log is applied as a tubal function, <italic>c</italic> is the index corresponding to the target class, and ||&#x000B7;||<sub><italic>q</italic></sub> is a vector norm.</p>
<p>The intuition behind Definition 4.4 is the following. If we have good features <inline-formula><mml:math id="M73"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:math></inline-formula>, then <inline-formula><mml:math id="M74"><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub><mml:mo>&#x02248;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>e</mml:mtext></mml:mstyle></mml:math></inline-formula>, the identity tube, and the remaining tubes will be closer to 0. In the transform domain, <inline-formula><mml:math id="M75"><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub><mml:mo>&#x02248;</mml:mo><mml:mstyle mathvariant='bold'><mml:mn>1</mml:mn></mml:mstyle></mml:math></inline-formula> and the remaining entries will be close to zero. 
When we apply the log pointwise in the transform domain, the tube <inline-formula><mml:math id="M76"><mml:mo class="qopname">log</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo class="qopname">&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>M</mml:mtext></mml:mstyle></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub><mml:mo>&#x02248;</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula> and the remaining entries will be large negative numbers. As a result, <italic>L</italic><sub>tce</sub> is smallest when the <inline-formula><mml:math id="M77"><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub><mml:mo>&#x02248;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>e</mml:mtext></mml:mstyle></mml:math></inline-formula>, as desired.</p>
<p>In practice, if <italic>q</italic>-norm corresponds to a finite integer, we can instead use <inline-formula><mml:math id="M78"><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mo class="qopname">log</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mi>h</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:mrow><mml:mo class="qopname">&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>:</mml:mo></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msubsup><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow><mml:mrow><mml:mi>q</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> for t-cross entropy for easier derivative computations. For numerical benefits when training, we consider normalized versions based on the number of tubal entries, e.g., multiply by 1/<italic>n</italic>. These suggested modifications should not change performance in theory, but could change preferred training hyperparameters.</p>
</sec>
<sec>
<title>4.4 Backward propagation with t-NNs</title>
<p>The workhorse of neural network training is backward propagation (Rumelhart et al., <xref ref-type="bibr" rid="B47">1986</xref>; Bengio et al., <xref ref-type="bibr" rid="B2">1994</xref>; Shalev-Shwartz et al., <xref ref-type="bibr" rid="B48">2017</xref>; Nielsen, <xref ref-type="bibr" rid="B40">2018</xref>), a method to calculate the gradient of the objective function (<xref ref-type="disp-formula" rid="E13">7</xref>) with respect to the weights. With gradient information, one can apply standard stochastic gradient optimization techniques to train.</p>
<p>In the &#x022C6;<sub><italic>M</italic></sub>-framework, for an orthogonal transformation <bold>M</bold>, the backpropagation formulas are analogous to the matrix case. For example, the derivatives of the &#x022C6;<sub><italic>M</italic></sub>-product are</p>
<disp-formula id="E21"><label>(12)</label><mml:math id="M79"><mml:mtable class="eqnarray" columnalign="right"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mi>&#x02202;</mml:mi><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">and</mml:mtext><mml:mo>&#x02003;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>&#x02202;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02202;</mml:mi><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x02202;</mml:mi><mml:mstyle 
mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M81"><mml:mi>&#x02202;</mml:mi><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:math></inline-formula> indicates a direction or perturbation of the same size as <inline-formula><mml:math id="M82"><mml:mstyle mathvariant="bold-script"><mml:mi>X</mml:mi></mml:mstyle></mml:math></inline-formula>. For full details on the derivation, we refer the reader to Newman (<xref ref-type="bibr" rid="B37">2019</xref>).</p>
<p>The simplicity of the back-propagation formulas (<xref ref-type="disp-formula" rid="E21">12</xref>) is one of the hallmarks of our choice of leveraging a matrix-mimetic tensor framework. Other tensor-based neural network designs (Wang et al., <xref ref-type="bibr" rid="B54">2023</xref>) often require complicated indexing and non-traditional notation which, in addition to being cumbersome, can preclude extending more sophisticated neural network architectures to higher dimensions. The &#x022C6;<sub><italic>M</italic></sub>-framework yields derivative formulations that are easy to interpret, implement, and analyze.</p></sec>
</sec>
<sec id="s5">
<title>5 Stable t-NNs</title>
<p>As the depth (number of layers) of a network increases, gradient-based training is subject to numerical instability such as the vanishing or exploding gradient problem (Bengio et al., <xref ref-type="bibr" rid="B2">1994</xref>; Shalev-Shwartz et al., <xref ref-type="bibr" rid="B48">2017</xref>). To avoid these instabilities, one can interpret deep neural networks as discretizations of differential equations (Ee, <xref ref-type="bibr" rid="B12">2017</xref>; Haber and Ruthotto, <xref ref-type="bibr" rid="B13">2017</xref>; Haber et al., <xref ref-type="bibr" rid="B14">2018</xref>) and analyze the stability of forward propagation as well as the well-posedness of the learning problem; i.e., whether the classifying function depends continuously on the initialization of the parameters (Ascher, <xref ref-type="bibr" rid="B1">2010</xref>). By ensuring stability and well-posedness, networks can generalize better to similar data and can classify data more robustly.</p>
<p>We emphasize that the notion of stability is related to the formal numerical analysis definition in the Lyapunov sense (i.e., stability of dynamical systems). This is a property of the model itself and independent of the data. From a statistical and foundational learning theory perspective, neural networks are typically over-parameterized models, which tend to overfit. In this context, stability can promote better generalization by imposing constraints on the structure of the weight matrices, effectively reducing the number of degrees of freedom. The tensorial structure imposes additional constraints and further reduces the number of parameters, which can again lead to better generalization.</p>
<sec>
<title>5.1 Well-posed learning problem criterion</title>
<p>Consider the residual neural network (He et al., <xref ref-type="bibr" rid="B17">2016</xref>) with tensor operations, given by</p>
<disp-formula id="E22"><label>(13)</label><mml:math id="M83"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">for</mml:mtext><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>d</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M84"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula><mml:math id="M85"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, and <inline-formula><mml:math id="M86"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>. With the addition of the step size <italic>h</italic>, we can interpret <xref ref-type="disp-formula" rid="E22">Equation (13)</xref> as a forward Euler discretization of the continuous ordinary differential equation (ODE)</p>
<disp-formula id="E23"><label>(14)</label><mml:math id="M87"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">with&#x02003;&#x000A0;</mml:mtext><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover 
accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for all <italic>t</italic> &#x02208; [0, <italic>T</italic>] where <italic>T</italic> is the final time corresponding to the depth of the discretized network.</p>
<p>The stability of non-autonomous ODEs like (<xref ref-type="disp-formula" rid="E23">14</xref>) depends on the eigenvalues of the Jacobian with respect to the features. To perform analogous analysis for tensor operators, it is useful to consider the equivalent block matrix version of the &#x022C6;<sub><italic>M</italic></sub>-product in <xref ref-type="disp-formula" rid="E4">Equation (1)</xref> where <inline-formula><mml:math id="M88"><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo> &#x02261; </mml:mo><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. It follows that we can matricize (<xref ref-type="disp-formula" rid="E23">14</xref>) via</p>
<disp-formula id="E24"><label>(15)</label><mml:math id="M89"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The Jacobian of the matricized system (<xref ref-type="disp-formula" rid="E24">15</xref>) with respect to <inline-formula><mml:math id="M90"><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is</p>
<disp-formula id="E25"><mml:math id="M91"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mstyle mathvariant='bold'><mml:mtext>J</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext class="textrm" mathvariant="normal">diag</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x02032;</mml:mi></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M92"><mml:mstyle mathvariant='bold'><mml:mtext>x</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> and <bold>J</bold>(<italic>t</italic>) &#x02208; &#x0211D;<sup><italic>mn</italic>&#x000D7;<italic>mn</italic></sup>. 
In most cases, the activation function &#x003C3; is non-decreasing, and thus the entries in diag(&#x003C3;&#x02032;(<bold>x</bold>(<italic>t</italic>))) are nonnegative. As a result, the stability and well-posedness of the ODE relies on the eigenvalues of <inline-formula><mml:math id="M93"><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. As described in Haber and Ruthotto (<xref ref-type="bibr" rid="B13">2017</xref>), the learning problem is well-posed if</p>
<disp-formula id="E26"><label>(16)</label><mml:math id="M94"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Re</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02248;</mml:mo><mml:mn>0</mml:mn><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">for</mml:mtext><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x02026;</mml:mo><mml:mo>,</mml:mo><mml:mi>m</mml:mi><mml:mi>n</mml:mi><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where &#x003BB;<sub><italic>i</italic></sub>(<bold>A</bold>) is the <italic>i</italic>-th eigenvalue of <bold>A</bold>. This criterion implies that the imaginary part of the eigenvalues drive the dynamics, promoting rotational movement of features. This produces stable forward propagation that avoids features diverging and prevents inputs from distinct classes from converging to indistinguishable points; the latter would lead to ill-posed back propagation and hence an ill-posed learning problem.</p>
<p>We can be more concrete about the eigenvalues because in <xref ref-type="disp-formula" rid="E4">Equation (1)</xref>, <inline-formula><mml:math id="M95"><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">struct</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is block-diagonalized in the transform domain. Thus, we can equivalently write <xref ref-type="disp-formula" rid="E26">Equation (16)</xref> as follows:</p>
<disp-formula id="E27"><label>(17)</label><mml:math id="M96"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Re</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x003BB;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="false"><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02248;</mml:mo><mml:mn>0</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>i</italic> &#x0003D; 1, &#x02026;, <italic>m</italic> and <italic>k</italic> &#x0003D; 1, &#x02026;, <italic>n</italic>. In short, if the eigenvalues of each frontal slice in the transform domain have a real part close to zero, the t-NN learning problem will be well-posed, save one more requirement.</p>
<p>A subtle, yet important requirement for stability is that the weights change gradually over time (i.e., layers). This ensures that small perturbations of the weights yield small perturbations of the features. To promote this desired behavior, we impose a smoothing regularizer in (discrete) time via</p>
<disp-formula id="E28"><label>(18)</label><mml:math id="M97"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>R</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">smooth</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:munderover></mml:mstyle><mml:mo>||</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:mo>||</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle 
mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>5.2 Hamiltonian-inspired stable t-NNs</title>
<p>While theoretically useful, it is impractical to evaluate the eigenvalues of the weight tensors to satisfy the well-posedness condition (<xref ref-type="disp-formula" rid="E27">17</xref>) as we train a network. Instead, motivated by the presentation in Haber and Ruthotto (<xref ref-type="bibr" rid="B13">2017</xref>), we implement a forward propagation that inherently satisfies (<xref ref-type="disp-formula" rid="E27">17</xref>) independent of the weights. This forward propagation scheme is inspired by Hamiltonian dynamics, which we briefly describe and refer to Ascher (<xref ref-type="bibr" rid="B1">2010</xref>) and Brooks et al. (<xref ref-type="bibr" rid="B4">2011</xref>) for further details. We define a Hamiltonian as follows:</p>
<p>Definition 5.1 (Hamiltonian). Let <inline-formula><mml:math id="M98"><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, <inline-formula><mml:math id="M99"><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>z</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, and <italic>t</italic> &#x02208; [0, <italic>T</italic>]. 
A Hamiltonian <inline-formula><mml:math id="M100"><mml:mi>H</mml:mi><mml:mo>:</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>m</mml:mi></mml:mrow><mml:mrow><mml:mi>z</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000D7;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:mi>&#x0211D;</mml:mi></mml:math></inline-formula> is a system governed by the following dynamics:</p>
<disp-formula id="E29"><mml:math id="M101"><mml:mtable columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mtext>&#x02003;&#x000A0;</mml:mtext><mml:mtext class="textrm" mathvariant="normal">and&#x02003;&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mi>H</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Intuitively, the Hamiltonian <italic>H</italic> describes the total energy of the system with <bold>y</bold> as the position and <bold>z</bold> as the momentum or velocity. We can separate total energy into potential energy <italic>U</italic> and kinetic energy <italic>T</italic>; that is, <italic>H</italic>(<bold>y</bold>, <bold>z</bold>, <italic>t</italic>) &#x0003D; <italic>U</italic>(<bold>y</bold>)&#x0002B;<italic>T</italic>(<bold>z</bold>). This separability ensures the Hamiltonian dynamics conserve energy; i.e.,</p>
<disp-formula id="E30"><label>(19)</label><mml:math id="M102"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mi>H</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mi>H</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mi>d</mml:mi><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:msub><mml:mrow><mml:mo>&#x02207;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>z</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mi>H</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In terms of neural networks, energy conservation (<xref ref-type="disp-formula" rid="E30">19</xref>) ensures that network features are preserved during forward propagation, thereby avoiding the issue of exploding/vanishing gradients and enabling the use of deeper networks. Additionally, Hamiltonians are symplectic or volume-preserving in the sense that the dynamics are divergence-free. For neural networks, this ensures the distance between features does not change significantly, avoiding converging and diverging behaviors. We also note that Hamiltonians are time-reversible. This ensures that if we have well-posed dynamics during forward propagation, we will have similar dynamics for backward propagation.</p>
</sec>
<sec>
<title>5.3 Discretizing Hamiltonians with leapfrog integration</title>
<p>To preserve the benefits of Hamiltonians in the discretized setting, we symmetrize the Hamiltonian (Definition 5.1) and use a leapfrog integration method (Skeel, <xref ref-type="bibr" rid="B49">1993</xref>; Ascher, <xref ref-type="bibr" rid="B1">2010</xref>; Haber and Ruthotto, <xref ref-type="bibr" rid="B13">2017</xref>). For t-NNs, we write the new system in terms of the &#x022C6;<sub><italic>M</italic></sub>-product (with slight abuse of notation)</p>
<disp-formula id="E31"><label>(20)</label><mml:math id="M103"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mfrac><mml:mrow><mml:mi>d</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mn>0</mml:mn></mml:mtd><mml:mtd><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mo>-</mml:mo><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msup></mml:mtd><mml:mtd><mml:mn>0</mml:mn></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mover 
accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M104"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> and <inline-formula><mml:math id="M105"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:math></inline-formula>. Here, <inline-formula><mml:math id="M106"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> indicates the data features and <inline-formula><mml:math id="M107"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> is an auxiliary variable not related to the data directly. We add the bias tube, <bold>b</bold>(<italic>t</italic>), to each element. The equivalent matricized version of <xref ref-type="disp-formula" rid="E31">Equation (20)</xref> is</p>
<disp-formula id="E32"><label>(21)</label><mml:math id="M109"><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:mfrac><mml:mi>d</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:mfrac><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mtext>unfold</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>Y</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x02192;</mml:mo></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>unfold</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>Z</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x02192;</mml:mo></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mn>0</mml:mn></mml:mstyle></mml:mtd><mml:mtd><mml:mrow><mml:mtext>struct</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>W</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mo>&#x02212;</mml:mo><mml:mtext>struct</mml:mtext><mml:msup><mml:mrow><mml:mo stretchy='false'>(</mml:mo><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>W</mml:mi></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo 
stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:msup></mml:mrow></mml:mtd><mml:mtd><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mn>0</mml:mn></mml:mstyle></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext>&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mrow><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mtext>unfold</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>Y</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x02192;</mml:mo></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mtext>unfold</mml:mtext><mml:mo stretchy='false'>(</mml:mo><mml:mover accent='true'><mml:mstyle mathvariant='bold-script' mathsize='normal'><mml:mi>Z</mml:mi></mml:mstyle><mml:mo stretchy='true'>&#x02192;</mml:mo></mml:mover><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant='bold' 
mathsize='normal'><mml:mtext>b</mml:mtext></mml:mstyle><mml:mo stretchy='false'>(</mml:mo><mml:mi>t</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The (matricized) system (<xref ref-type="disp-formula" rid="E32">21</xref>) is inherently stable, independent of the weight tensors <inline-formula><mml:math id="M110"><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>, because of the block antisymmetric structure. The eigenvalues of antisymmetric matrices are purely imaginary, which exactly satisfy the stability condition in <xref ref-type="disp-formula" rid="E27">Equation (17)</xref>.</p>
<p>We discretize <xref ref-type="disp-formula" rid="E31">Equation (20)</xref> using the leapfrog method, a symplectic integration technique, defined as</p>
<disp-formula id="E33"><label>(22a)</label><mml:math id="M111"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:mrow></mml:msubsup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E34"><label>(22b)</label><mml:math id="M112"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:mfrac></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>for <italic>j</italic> &#x0003D; 0, &#x02026;, <italic>d</italic>&#x02212;1. We demonstrate the benefits of stable forward propagation (<xref ref-type="disp-formula" rid="E33">22</xref>) in Example 5.2.</p>
<p>Example 5.2 (Trajectories of stable t-NNs). We construct a dataset in &#x0211D;<sup>3</sup> randomly drawn from a multivariate normal distribution with mean of 0 and a covariance matrix of 3<bold>I</bold><sub>3</sub>. The points are divided into three classes based on distance to the origin with yellow points inside a sphere with radius <italic>r</italic> &#x0003D; 3.5, green points inside a sphere with radius <italic>R</italic> &#x0003D; 5.5, and purple points outside both spheres. We train with 1200 data points and store the data as 1 &#x000D7; 1 &#x000D7; 3 tubes.</p>
<p>We forward propagate using one of two integrators with weights and biases as 1 &#x000D7; 1 &#x000D7; 3 tubes:</p>
<disp-formula id="E35"><label>(23a)</label><mml:math id="M113"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">Forward Euler</mml:mtext></mml:mtd><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>w</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E36"><label>(23b)</label><mml:math id="M114"><mml:mrow><mml:mtext>Leapfrog&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;&#x000A0;</mml:mtext><mml:mrow><mml:mo>{</mml:mo><mml:mtable columnalign='left'><mml:mtr><mml:mtd><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x02212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:msub><mml:mo>&#x02212;</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msubsup><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>w</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>&#x022A4;</mml:mo></mml:msubsup><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>y</mml:mi></mml:mstyle><mml:mi>j</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>b</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>y</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>y</mml:mi></mml:mstyle><mml:mi>j</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:mi>h</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mstyle mathvariant='bold' 
mathsize='normal'><mml:mi>w</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi></mml:msub><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>z</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>b</mml:mi></mml:mstyle><mml:mrow><mml:mi>j</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo stretchy='false'>)</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>We create a network with <italic>d</italic> &#x0003D; 32 layers and use a step size of <italic>h</italic> &#x0003D; 2. We use the discrete cosine transform for <bold>M</bold>. We train for 50 epochs using Adam (Kingma and Ba, <xref ref-type="bibr" rid="B24">2015</xref>) with a batch size of 10 and a learning rate of &#x003B3; &#x0003D; 10<sup>&#x02212;2</sup>. To create smoother dynamics, we regularize the weights using <xref ref-type="disp-formula" rid="E28">Equation (18)</xref> with regularization parameter &#x003BB; &#x0003D; 10<sup>&#x02212;4</sup>/<italic>h</italic>. We illustrate the dynamics of trained networks in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>Comparison of feature trajectories for stable t-NNs with forward Euler (23a) and leapfrog integration (23b). Each image contains the features at a particular layer of a trained network (layers <italic>j</italic> &#x0003D; 0, 4, &#x02026;, 32). The forward Euler network resulted in a test accuracy of 90% and the leapfrog network resulted in a test accuracy of 93.50%. As expected, the leapfrog trajectory is smoother and contains rotational dynamics. The colors are linearly separable at the last layer, indicating good classification performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0004.tif"/>
</fig>
<p>The dynamics in <xref ref-type="fig" rid="F4">Figure 4</xref> show the topological benefits of leapfrog integration. The data points exhibit smoother, rotational dynamics to reach the desired linearly-separable final configuration. In comparison, forward Euler propagation significantly changes topology during forward propagation. Such topological changes may yield ill-posed learning problems and poor network generalization.</p></sec>
</sec>
<sec id="s6">
<title>6 Numerical results</title>
<p>We present two image classification problems to compare tensor linear layers and stable tensor neural networks to comparable matrix versions. Overall, the results show that t-NNs trained with tubal loss functions generalize better to unseen data than equivalent matrix networks, and can do so with 20&#x02013;30 times fewer network weights.</p>
<sec>
<title>6.1 Experiment setup and hardware</title>
<p>We implement both tensor and matrix frameworks using PyTorch (RRID:SCR_018536) (Paszke et al., <xref ref-type="bibr" rid="B44">2017</xref>). All of the code to reproduce the experiments is provided in <ext-link ext-link-type="uri" xlink:href="https://github.com/elizabethnewman/tnn">https://github.com/elizabethnewman/tnn</ext-link>. All experiments were run on an Exxact server with four RTX A6000 GPUs, each with 48 GB of RAM. Only one GPU was used to generate each result. The results we report are for the networks that yielded the best accuracy on the validation data.</p>
<p>We train all models using the stochastic gradient method Adam (Kingma and Ba, <xref ref-type="bibr" rid="B24">2015</xref>), which uses a default learning rate of 10<sup>&#x02212;3</sup>. In most cases, we select hyperparameters and weight initialization based on the default settings in PyTorch. We indicate the few exceptions to this at the start of the corresponding sections. We also pair the stochastic optimizers with a learning rate scheduler that decreases the learning rate by a factor of &#x003B3; every <italic>M</italic> steps. In all cases, we used the default &#x003B3; &#x0003D; 0.9 and <italic>M</italic> &#x0003D; 100. This is a common practice to ensure convergence of stochastic optimizer in idealized settings (Bottou et al., <xref ref-type="bibr" rid="B3">2018</xref>). We utilize a weight decay parameter in some experiments to reduce overfitting.</p>
<p>For the tensor networks in all experiments, we use the discrete cosine transform for <bold>M</bold>, which is a close, real-valued version of the discrete Fourier transform (DFT). The DFT matrix corresponds to the t-product, which has been shown to be effective for natural image applications (Kilmer and Martin, <xref ref-type="bibr" rid="B23">2011</xref>; Hao et al., <xref ref-type="bibr" rid="B15">2013</xref>; Newman et al., <xref ref-type="bibr" rid="B38">2018</xref>).</p>
</sec>
<sec>
<title>6.2 MNIST dimensionality reduction</title>
<p>The MNIST dataset (LeCun et al., <xref ref-type="bibr" rid="B30">2010</xref>) is composed of 28 &#x000D7; 28 grayscale images of handwritten digits. We train on 50, 000 images and reserve 10, 000 for validation. We report test accuracy on 10, 000 images not used for training nor validation. For NNs, we vectorize images and store as columns, resulting in a matrix of size 28<sup>2</sup>&#x000D7;<italic>b</italic> where <italic>b</italic> is the number of images. For t-NNs, we store the images as lateral slices, resulting in a tensor of size 28 &#x000D7; <italic>b</italic>&#x000D7;28.</p>
<p>In this experiment, we train an autoencoder to efficiently represent the high-dimensional MNIST data in a low-dimensional subspace. Autoencoders can be thought of as nonlinear, parameterized extensions of the (truncated) singular value decomposition. Our goal is to solve the (unsupervised) learning problem</p>
<disp-formula id="E37"><label>(24)</label><mml:math id="M115"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">min</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:msub><mml:mrow><mml:mi>E</mml:mi></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>&#x0007E;</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:mrow></mml:msub><mml:mtext>&#x000A0;</mml:mtext><mml:mo>||</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mtext class="textrm" 
mathvariant="normal">dec</mml:mtext></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>-</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:msubsup><mml:mrow><mml:mo>||</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <inline-formula><mml:math id="M116"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> is the <italic>encoder</italic> and <inline-formula><mml:math id="M117"><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>:</mml:mo><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow><mml:mi>&#x02192;</mml:mi><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> is the <italic>decoder</italic>. Here, <inline-formula><mml:math id="M118"><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow></mml:math></inline-formula> is the latent space that is smaller than the data space; in terms of dimension, we say <inline-formula><mml:math id="M119"><mml:mo class="qopname">dim</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Z</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0003C;</mml:mo><mml:mo class="qopname">dim</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Note that the mean squared error (MSE) tubal loss is the same as the MSE loss function when using an orthogonal transformation matrix, as we have done in our experiments. 
We describe the NN and t-NN autoencoder architectures used in <xref ref-type="fig" rid="F5">Figure 5</xref> and report results in <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Description of MNIST autoencoder architectures with &#x003C3;(<italic>x</italic>) &#x0003D; tanh(<italic>x</italic>). <bold>(Top)</bold> Four-layer matrix autoencoder NN(<italic>m</italic>,<italic>d</italic>) with first width <italic>m</italic> and latent space dimension <italic>d</italic>. <bold>(Middle)</bold> Four-layer tensor autoencoder t-NN(<italic>m</italic>,<italic>d</italic>) with first width <italic>m</italic> and latent space dimension <italic>d</italic>. For notational simplicity, we omit the vector lateral slice notation for the t-NN. <bold>(Bottom)</bold> Table of networks that we use. We pick sizes relative to the dimensions of t-NN(20,10). The first network NN(40,10) is given more features in the first layer of the encoder. The second network NN(21,280) is given the same number of latent space features and a corresponding width to have roughly the same number of weights as the t-NN. The third network NN(560,280) is given the same number of features on both layers as the t-NN.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0005.tif"/>
</fig>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p><bold>(Left)</bold> Convergence of the loss for the autoencoder example. The t-NN converges to a lower loss more quickly than the NNs. The validation loss closely follows the training loss, indicating good generalization. <bold>(Right)</bold> Autoencoder approximations to test images. The top row contains the true test images <bold>y</bold> and the subsequent rows contain the approximations to the true image <inline-formula><mml:math id="M120"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">enc</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">dec</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula> for various autoencoder architectures. 
To the right of each row, we report the average test error, <inline-formula><mml:math id="M121"><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">test</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mstyle mathvariant="script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">test</mml:mtext></mml:mstyle></mml:mrow></mml:msub></mml:mrow></mml:munder><mml:mo>|</mml:mo><mml:mo>|</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle><mml:mo>-</mml:mo><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>y</mml:mtext></mml:mstyle></mml:mrow><mml:mo>&#x0007E;</mml:mo></mml:mover><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mo>|</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>. Compared to the first two NN autoencoders, the t-NN produces clearer approximations and a test error an order of magnitude smaller. The autoencoder NN(560,280) does produce the smallest test error, but requires over 20 times more network weights than the t-NN autoencoder.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0006.tif"/>
</fig>
<p>We observe that the t-NN outperforms the NN autoencoders with similar numbers of weights with an order of magnitude smaller training and validation loss and test error as well as qualitative improvements of the approximations. The neural network autoencoder with the same feature space dimensions, NN(560,280), performs best in terms of the loss and error metrics, but requires over 20 times more network weights than the t-NN autoencoder. The t-NN layers are able to capture spatial correlations more effectively using multilinear operations, resulting in quality approximations with significantly fewer weights.</p>
</sec>
<sec>
<title>6.3 MNIST classification</title>
<p>We use the same MNIST dataset as for the autoencoder example. We train for 20 epochs using Adam with a batch size of 32 and a learning rate of 10<sup>&#x02212;2</sup>. We add Tikhonov regularization (weight decay) with a regularization parameter of &#x003BB; &#x0003D; 10<sup>&#x02212;4</sup>. We use the PyTorch defaults for the other optimizer hyperparameters. For the t-cross entropy loss, we use a squared &#x02113;<sub>2</sub>-norm and normalize by the number of entries in the tube.</p>
<p>We compare four different two-layer neural network architectures, described in <xref ref-type="table" rid="T1">Table 1</xref>. We use either cross entropy loss or t-cross entropy loss, depending on the architecture.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Description of MNIST two-layer network architectures with &#x003C3;(<italic>x</italic>) &#x0003D; tanh(<italic>x</italic>).</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Name</bold></th>
<th valign="top" align="left"><bold>Architecture</bold></th>
<th valign="top" align="left"><bold>Layer 1</bold></th>
<th valign="top" align="left"><bold>Layer 2</bold></th>
<th valign="top" align="center"><bold>|&#x003B8;|</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="2">NN</td>
<td valign="top" align="left" rowspan="2"><bold>W</bold><sub>2</sub>&#x003C3;(<bold>W</bold><sub>1</sub><bold>y</bold><sub>0</sub>&#x0002B;<bold>b</bold><sub>1</sub>)&#x0002B;<bold>b</bold><sub>2</sub></td>
<td valign="top" align="left"><bold>W</bold><sub>1</sub>: 39 &#x000D7; 28<sup>2</sup></td>
<td valign="top" align="left"><bold>W</bold><sub>2</sub>: 10 &#x000D7; 39</td>
<td valign="top" align="center" rowspan="2">31, 015</td>
</tr>
<tr>
<td valign="top" align="left"><bold>b</bold><sub>1</sub>: 39 &#x000D7; 1</td>
<td valign="top" align="left"><bold>b</bold><sub>2</sub>: 10 &#x000D7; 1</td>
</tr> <tr>
<td valign="top" align="left" rowspan="2">NN, square</td>
<td valign="top" align="left" rowspan="2"><bold>W</bold><sub>2</sub>&#x003C3;(<bold>W</bold><sub>1</sub><bold>y</bold><sub>0</sub>&#x0002B;<bold>b</bold><sub>1</sub>)&#x0002B;<bold>b</bold><sub>2</sub></td>
<td valign="top" align="left"><bold>W</bold><sub>1</sub>: 28<sup>2</sup>&#x000D7;28<sup>2</sup></td>
<td valign="top" align="left"><bold>W</bold><sub>2</sub>: 10 &#x000D7; 28<sup>2</sup></td>
<td valign="top" align="center" rowspan="2">623, 290</td>
</tr>
<tr>
<td valign="top" align="left"><bold>b</bold><sub>1</sub>: 28<sup>2</sup>&#x000D7;1</td>
<td valign="top" align="left"><bold>b</bold><sub>2</sub>: 10 &#x000D7; 1</td>
</tr> <tr>
<td valign="top" align="left" rowspan="2">t-NN</td>
<td valign="top" align="left" rowspan="2"><inline-formula><mml:math id="M122"><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>W</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">unfold</mml:mtext></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mstyle mathvariant='bold'><mml:mtext>b</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td valign="top" align="left"><inline-formula><mml:math id="M123"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 28 &#x000D7; 28 &#x000D7; 28</td>
<td valign="top" align="left"><bold>W</bold><sub>2</sub>: 10 &#x000D7; 28<sup>2</sup></td>
<td valign="top" align="center" rowspan="2">30, 586</td>
</tr>
<tr>
<td valign="top" align="left"><inline-formula><mml:math id="M124"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 28 &#x000D7; 1 &#x000D7; 28</td>
<td valign="top" align="left"><bold>b</bold><sub>2</sub>: 10 &#x000D7; 1</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">t-NN, t-loss</td>
<td valign="top" align="left" rowspan="2"><inline-formula><mml:math id="M125"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi><mml:mi>&#x003C3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x022C6;</mml:mo><mml:mi>M</mml:mi><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula></td>
<td valign="top" align="left"><inline-formula><mml:math id="M126"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 28 &#x000D7; 28 &#x000D7; 28</td>
<td valign="top" align="left"><inline-formula><mml:math id="M127"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 10 &#x000D7; 28 &#x000D7; 28</td>
<td valign="top" align="center" rowspan="2">30, 856</td>
</tr>
<tr>
<td valign="top" align="left"><inline-formula><mml:math id="M128"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 28 &#x000D7; 1 &#x000D7; 28</td>
<td valign="top" align="left"><inline-formula><mml:math id="M129"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 10 &#x000D7; 1 &#x000D7; 28</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The t-NN architectures were chosen to preserve the size of the images. The NN width of 39 is chosen to be as small as possible such that the NN architecture does not have fewer parameters than the t-NN architecture. The final column reports the total number of learnable weights.</p>
</table-wrap-foot>
</table-wrap>
<p>We report the convergence and accuracy results in <xref ref-type="fig" rid="F7">Figure 7</xref> and <xref ref-type="table" rid="T2">Table 2</xref>, respectively.</p>
<fig id="F7" position="float">
<label>Figure 7</label>
<caption><p>Loss and accuracy convergence for MNIST with four different two-layer neural networks. To see the differences clearly, we omit the initial accuracy, which was close to 10% for each network. The t-NN with cross entropy loss (orange diamonds <inline-graphic xlink:href="fdata-07-1363978-i0001.tif"/>) produces the best training (darker, solid) and validation (lighter, dashed) accuracy. The t-NN with t-loss (green squares <inline-graphic xlink:href="fdata-07-1363978-i0002.tif"/>) performs second best in terms of accuracy, demonstrating the benefits of tensor operator layers. Despite having the greatest number of weights, the NN with square weights (purple &#x000D7; &#x00027;s) performs worst in terms of accuracy.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0007.tif"/>
</fig>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>MNIST training, validation, and test accuracy per class and overall for the four architectures.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left" colspan="2"></th>
<th valign="top" align="left"><bold>0</bold></th>
<th valign="top" align="left"><bold>1</bold></th>
<th valign="top" align="left"><bold>2</bold></th>
<th valign="top" align="left"><bold>3</bold></th>
<th valign="top" align="left"><bold>4</bold></th>
<th valign="top" align="left"><bold>5</bold></th>
<th valign="top" align="left"><bold>6</bold></th>
<th valign="top" align="left"><bold>7</bold></th>
<th valign="top" align="left"><bold>8</bold></th>
<th valign="top" align="left"><bold>9</bold></th>
<th valign="top" align="left"><bold>Overall</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="4">Train</td>
<td valign="top" align="left">NN</td>
<td valign="top" align="left">97.05</td>
<td valign="top" align="left">96.54</td>
<td valign="top" align="left">90.58</td>
<td valign="top" align="left">89.73</td>
<td valign="top" align="left">93.03</td>
<td valign="top" align="left">88.91</td>
<td valign="top" align="left">92.36</td>
<td valign="top" align="left">91.91</td>
<td valign="top" align="left">87.79</td>
<td valign="top" align="left">84.57</td>
<td valign="top" align="center">91.33</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN, square</td>
<td valign="top" align="left">96.67</td>
<td valign="top" align="left">93.85</td>
<td valign="top" align="left">92.58</td>
<td valign="top" align="left">84.82</td>
<td valign="top" align="left">94.84</td>
<td valign="top" align="left">89.11</td>
<td valign="top" align="left">97.91</td>
<td valign="top" align="left">88.76</td>
<td valign="top" align="left">77.98</td>
<td valign="top" align="left">85.56</td>
<td valign="top" align="center">90.24</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN</td>
<td valign="top" align="left"><bold>98.98</bold></td>
<td valign="top" align="left"><bold>99.64</bold></td>
<td valign="top" align="left"><bold>97.12</bold></td>
<td valign="top" align="left"><bold>97.88</bold></td>
<td valign="top" align="left"><bold>98.29</bold></td>
<td valign="top" align="left"><bold>96.06</bold></td>
<td valign="top" align="left"><bold>98.68</bold></td>
<td valign="top" align="left"><bold>97.60</bold></td>
<td valign="top" align="left"><bold>93.30</bold></td>
<td valign="top" align="left"><bold>95.59</bold></td>
<td valign="top" align="center"><bold>97.36</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN, t-loss</td>
<td valign="top" align="left">97.28</td>
<td valign="top" align="left">97.06</td>
<td valign="top" align="left">95.97</td>
<td valign="top" align="left">91.09</td>
<td valign="top" align="left">94.92</td>
<td valign="top" align="left">92.50</td>
<td valign="top" align="left">97.26</td>
<td valign="top" align="left">92.26</td>
<td valign="top" align="left">91.99</td>
<td valign="top" align="left">91.61</td>
<td valign="top" align="center">94.22</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="4"> Valid</td>
<td valign="top" align="left">NN</td>
<td valign="top" align="left">96.90</td>
<td valign="top" align="left">96.46</td>
<td valign="top" align="left">89.50</td>
<td valign="top" align="left">89.79</td>
<td valign="top" align="left">92.13</td>
<td valign="top" align="left">89.94</td>
<td valign="top" align="left">92.28</td>
<td valign="top" align="left">91.33</td>
<td valign="top" align="left">88.85</td>
<td valign="top" align="left">81.02</td>
<td valign="top" align="center">90.94</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN, square</td>
<td valign="top" align="left">94.50</td>
<td valign="top" align="left">93.36</td>
<td valign="top" align="left">90.60</td>
<td valign="top" align="left">84.68</td>
<td valign="top" align="left">94.99</td>
<td valign="top" align="left">89.83</td>
<td valign="top" align="left"><bold>97.36</bold></td>
<td valign="top" align="left">89.26</td>
<td valign="top" align="left">77.81</td>
<td valign="top" align="left">82.16</td>
<td valign="top" align="center">89.52</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN</td>
<td valign="top" align="left"><bold>97.60</bold></td>
<td valign="top" align="left"><bold>99.29</bold></td>
<td valign="top" align="left"><bold>95.40</bold></td>
<td valign="top" align="left"><bold>97.01</bold></td>
<td valign="top" align="left"><bold>97.34</bold></td>
<td valign="top" align="left"><bold>95.66</bold></td>
<td valign="top" align="left">96.75</td>
<td valign="top" align="left"><bold>97.08</bold></td>
<td valign="top" align="left"><bold>92.47</bold></td>
<td valign="top" align="left"><bold>93.26</bold></td>
<td valign="top" align="center"><bold>96.26</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN, t-loss</td>
<td valign="top" align="left">96.20</td>
<td valign="top" align="left">97.08</td>
<td valign="top" align="left">94.80</td>
<td valign="top" align="left">90.94</td>
<td valign="top" align="left">93.46</td>
<td valign="top" align="left">93.60</td>
<td valign="top" align="left">97.26</td>
<td valign="top" align="left">92.46</td>
<td valign="top" align="left">92.16</td>
<td valign="top" align="left">89.21</td>
<td valign="top" align="center">93.76</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="4">Test</td>
<td valign="top" align="left">NN</td>
<td valign="top" align="left">98.16</td>
<td valign="top" align="left">97.18</td>
<td valign="top" align="left">89.44</td>
<td valign="top" align="left">92.18</td>
<td valign="top" align="left">93.08</td>
<td valign="top" align="left">88.00</td>
<td valign="top" align="left">90.50</td>
<td valign="top" align="left">89.49</td>
<td valign="top" align="left">87.58</td>
<td valign="top" align="left">85.43</td>
<td valign="top" align="center">91.20</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN, square</td>
<td valign="top" align="left">97.14</td>
<td valign="top" align="left">94.89</td>
<td valign="top" align="left">92.44</td>
<td valign="top" align="left">85.84</td>
<td valign="top" align="left">95.01</td>
<td valign="top" align="left">88.79</td>
<td valign="top" align="left"><bold>97.49</bold></td>
<td valign="top" align="left">87.45</td>
<td valign="top" align="left">78.23</td>
<td valign="top" align="left">83.35</td>
<td valign="top" align="center">90.11</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN</td>
<td valign="top" align="left"><bold>98.67</bold></td>
<td valign="top" align="left"><bold>99.38</bold></td>
<td valign="top" align="left"><bold>95.25</bold></td>
<td valign="top" align="left"><bold>97.03</bold></td>
<td valign="top" align="left"><bold>98.37</bold></td>
<td valign="top" align="left"><bold>94.96</bold></td>
<td valign="top" align="left">97.08</td>
<td valign="top" align="left"><bold>96.50</bold></td>
<td valign="top" align="left">93.74</td>
<td valign="top" align="left"><bold>94.05</bold></td>
<td valign="top" align="center"><bold>96.55</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN, t-loss</td>
<td valign="top" align="left">98.16</td>
<td valign="top" align="left">98.33</td>
<td valign="top" align="left">95.35</td>
<td valign="top" align="left">93.66</td>
<td valign="top" align="left">95.21</td>
<td valign="top" align="left">91.59</td>
<td valign="top" align="left">97.18</td>
<td valign="top" align="left">90.66</td>
<td valign="top" align="left"><bold>93.94</bold></td>
<td valign="top" align="left">91.08</td>
<td valign="top" align="center">94.57</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The t-NN architecture with cross entropy loss consistently produces the highest accuracy. The bolded values indicate the highest accuracy for the class and dataset.</p>
</table-wrap-foot>
</table-wrap>
<p>The t-NN architecture with cross entropy loss outperforms all networks in terms of test accuracy and accuracy per class. The second-best performing network is the t-NN with t-loss. These results are evidence that the features learned from the tensor linear layer (layer 1) are better than those learned by a dense matrix layer. We further note that the matrix network NN with square weights has the same final layer shape as the t-NN with cross entropy loss; the only difference between the networks is the first layer. We depict the learned features of each network that preserves the size of the images in <xref ref-type="fig" rid="F8">Figure 8</xref>. The t-NN features from the first layer contain more structure than the NN features with square weights. This reflects the &#x022C6;<sub><italic>M</italic></sub>-operation, which first acts along the tubes (rows of the images in <xref ref-type="fig" rid="F8">Figure 8</xref>). We also observe that the features of NN are more extreme, close to &#x0002B;1 and &#x02212;1, the limits of the range of the activation function &#x003C3;(<italic>x</italic>) &#x0003D; tanh(<italic>x</italic>). In comparison, the features extracted from the t-NN with t-loss offer more variety of entries, but still hit the extreme values often. This demonstrates that t-NNs still produce rich feature spaces and are able to achieve these features with about 20 times fewer weights.</p>
<fig id="F8" position="float">
<label>Figure 8</label>
<caption><p>Features from first layer of NN, square, t-NN with cross entropy, and t-NN with t-cross entropy networks. Both t-NN features contain more structure because the &#x022C6;<sub><italic>M</italic></sub>-operation respects spatial correlations.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0008.tif"/>
</fig>
<p>We note that the t-NN network with t-cross entropy loss performs well, and we gain insight into the benefits of t-losses from the accuracy per class in <xref ref-type="fig" rid="F9">Figure 9</xref>. We observe that when we use tubal losses, we require high values for many frontal slices. This creates a more rigorous classification requirement and, as we will see in subsequent experiments, can yield networks that generalize better. Additionally, the distribution of values in the tubal softmax function is reflective of the predicted class. For the second image (true = 3), the two most likely predicted classes were, in order, 3 and 8. Qualitatively, this particular handwritten 3 has similarities to the digit 8, and the tubal softmax captures this similarity. For the cases in which the digit was incorrectly predicted, the handwritten image had structure emblematic of the predicted class, and the second most likely predicted class matched the true label.</p>
<fig id="F9" position="float">
<label>Figure 9</label>
<caption><p>Illustration of t-softmax of MNIST test images using t-NN with t-loss. The top row are the test images <inline-formula><mml:math id="M130"><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>28</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>28</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and the bottom row are the values of the tubal softmax of the output <inline-formula><mml:math id="M131"><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">tnn</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>Y</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mstyle mathvariant='bold'><mml:mtext>&#x003B8;</mml:mtext></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02208;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x0211D;</mml:mi></mml:mrow><mml:mrow><mml:mn>10</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x000D7;</mml:mo><mml:mn>28</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, shown in the transform domain. Each row of the tubal softmax images corresponds to a different class and each column to a different frontal slice. The row with the largest &#x02113;<sub>2</sub>-norm corresponds to the predicted class, where the top row corresponds to class 0 and the bottom row corresponds to class 9. The left two images were predicted correctly and the right two images were predicted incorrectly.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0009.tif"/>
</fig>
</sec>
<sec>
<title>6.4 CIFAR-10</title>
<p>The CIFAR-10 dataset (Krizhevsky and Hinton, <xref ref-type="bibr" rid="B27">2009</xref>) is composed of 32 &#x000D7; 32 &#x000D7; 3 RGB natural images belonging to ten classes. We train on 40, 000 images and reserve 10, 000 for validation. We report test accuracy on 10, 000 images not used for training nor validation. For NNs, we vectorize images and store as columns, resulting in a matrix of size (3&#x000B7;32<sup>2</sup>) &#x000D7; <italic>b</italic> where <italic>b</italic> is the number of images. For t-NNs, we store the images as lateral slices and stack the color channels vertically, resulting in a tensor of size (3&#x000B7;32) &#x000D7; <italic>b</italic>&#x000D7;32. We train for 500 epochs using Adam with a batch size of 32. We use a learning rate of 10<sup>&#x02212;3</sup> that decays after every 100 epochs by a factor of 0.9. We use the PyTorch defaults for the other optimizer hyperparameters. For the t-cross entropy loss, we use a squared &#x02113;<sub>2</sub>-norm and normalize by the number of entries in the tube.</p>
<p>We compare the performance of the Hamiltonian networks with dense matrix operators and dense tensor operators for various numbers of layers (<italic>d</italic> &#x0003D; 4, 8). In conjunction with the Hamiltonian network, we use the smoothing regularizer (<xref ref-type="disp-formula" rid="E31">20</xref>) with a regularization parameter of &#x003BB; &#x0003D; 10<sup>&#x02212;2</sup>. We describe the network architectures and number of parameters in <xref ref-type="table" rid="T3">Table 3</xref>. The NN architectures require more than 30 times the number of weights than the t-NN architectures. We compare the convergence and accuracy results in <xref ref-type="fig" rid="F10">Figure 10</xref> and <xref ref-type="table" rid="T4">Table 4</xref>, respectively.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Description of CIFAR-10 Hamiltonian network architectures with &#x003C3;(<italic>x</italic>) &#x0003D; tanh(<italic>x</italic>).</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Name</bold></th>
<th valign="top" align="left"><bold>Hamiltonian layers</bold></th>
<th valign="top" align="left"><bold>Final layer</bold></th>
<th valign="top" align="left"><bold>|&#x003B8;|</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="2">NN</td>
<td valign="top" align="left"><bold>W</bold><sub><italic>j</italic></sub>: (3&#x000B7;32<sup>2</sup>) &#x000D7; (3&#x000B7;32<sup>2</sup>)</td>
<td valign="top" align="left"><bold>W</bold><sub><italic>d</italic>&#x0002B;1</sub>: 10 &#x000D7; 3072</td>
<td valign="top" align="left">4 layers: 37,779,470</td>
</tr>
<tr>
<td valign="top" align="left"><bold>b</bold><sub><italic>j</italic></sub>: 1 &#x000D7; 1</td>
<td valign="top" align="left"><bold>b</bold><sub><italic>d</italic>&#x0002B;1</sub>: 10 &#x000D7; 1</td>
<td valign="top" align="left">8 layers: 75,528,210</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">t-NN</td>
<td valign="top" align="left"><bold>W</bold><sub><italic>j</italic></sub>: (3&#x000B7;32) &#x000D7; (3&#x000B7;32) &#x000D7; 32</td>
<td valign="top" align="left"><bold>W</bold><sub><italic>d</italic>&#x0002B;1</sub>: 10 &#x000D7; (3&#x000B7;32<sup>2</sup>)</td>
<td valign="top" align="left">4 layers: 1,210,506</td>
</tr>
<tr>
<td valign="top" align="left"><bold>b</bold><sub><italic>j</italic></sub>: 1 &#x000D7; 1 &#x000D7; 32</td>
<td valign="top" align="left"><bold>b</bold><sub><italic>d</italic>&#x0002B;1</sub>: 10 &#x000D7; 1</td>
<td valign="top" align="left">8 layers: 2,390,282</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">t-NN, t-loss</td>
<td valign="top" align="left"><inline-formula><mml:math id="M132"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>: (3&#x000B7;32) &#x000D7; (3&#x000B7;32) &#x000D7; 32</td>
<td valign="top" align="left"><inline-formula><mml:math id="M133"><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>W</mml:mi></mml:mstyle></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 10 &#x000D7; (3&#x000B7;32) &#x000D7; 32</td>
<td valign="top" align="left">4 layers: 1,210,816</td>
</tr>
<tr>
<td valign="top" align="left"><inline-formula><mml:math id="M134"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>: 1 &#x000D7; 1 &#x000D7; 32</td>
<td valign="top" align="left"><inline-formula><mml:math id="M135"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mstyle mathvariant="bold-script"><mml:mi>B</mml:mi></mml:mstyle></mml:mrow><mml:mo>&#x02192;</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>: 10 &#x000D7; 1 &#x000D7; 32</td>
<td valign="top" align="left">8 layers: 2,390,592</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The architectures were chosen to preserve the sizes of the CIFAR-10 images. The final column |<italic><bold>&#x003B8;</bold></italic>| reports the total number of learnable weights.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F10" position="float">
<label>Figure 10</label>
<caption><p>Loss and accuracy convergence for CIFAR-10 with different Hamiltonian network depths. We only show the convergence for the t-NN with t-cross entropy loss, which achieved a top validation accuracy of at least 54.37%, compared to the t-NN with cross entropy loss, which topped out at 54.32%. For the accuracy, we start with epoch 5 to highlight the differences between networks.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0010.tif"/>
</fig>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>CIFAR-10 training, validation, and test accuracy per class and overall for the four architectures.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th/>
<th/>
<th valign="top" align="left"><bold>Plane</bold></th>
<th valign="top" align="left"><bold>Car</bold></th>
<th valign="top" align="left"><bold>Bird</bold></th>
<th valign="top" align="left"><bold>Cat</bold></th>
<th valign="top" align="left"><bold>Deer</bold></th>
<th valign="top" align="left"><bold>Dog</bold></th>
<th valign="top" align="left"><bold>Frog</bold></th>
<th valign="top" align="left"><bold>Horse</bold></th>
<th valign="top" align="left"><bold>Ship</bold></th>
<th valign="top" align="left"><bold>Truck</bold></th>
<th valign="top" align="center"><bold>Overall</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="4">Train</td>
<td valign="top" align="left">NN4</td>
<td valign="top" align="left"><bold>99.95</bold></td>
<td valign="top" align="left"><bold>100.00</bold></td>
<td valign="top" align="left"><bold>99.95</bold></td>
<td valign="top" align="left"><bold>100.00</bold></td>
<td valign="top" align="left"><bold>99.98</bold></td>
<td valign="top" align="left"><bold>99.97</bold></td>
<td valign="top" align="left"><bold>100.00</bold></td>
<td valign="top" align="left"><bold>99.97</bold></td>
<td valign="top" align="left"><bold>100.00</bold></td>
<td valign="top" align="left"><bold>100.00</bold></td>
<td valign="top" align="center"><bold>99.98</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN8</td>
<td valign="top" align="left">99.90</td>
<td valign="top" align="left">99.97</td>
<td valign="top" align="left">99.80</td>
<td valign="top" align="left">99.95</td>
<td valign="top" align="left">99.93</td>
<td valign="top" align="left">100.00</td>
<td valign="top" align="left">100.00</td>
<td valign="top" align="left">100.00</td>
<td valign="top" align="left">99.93</td>
<td valign="top" align="left">99.93</td>
<td valign="top" align="center">99.94</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN4</td>
<td valign="top" align="left">74.12</td>
<td valign="top" align="left">79.97</td>
<td valign="top" align="left">60.44</td>
<td valign="top" align="left">43.02</td>
<td valign="top" align="left">59.82</td>
<td valign="top" align="left">65.27</td>
<td valign="top" align="left">65.66</td>
<td valign="top" align="left">88.47</td>
<td valign="top" align="left">77.14</td>
<td valign="top" align="left">74.83</td>
<td valign="top" align="center">68.84</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN8</td>
<td valign="top" align="left">76.09</td>
<td valign="top" align="left">81.15</td>
<td valign="top" align="left">61.12</td>
<td valign="top" align="left">35.80</td>
<td valign="top" align="left">69.04</td>
<td valign="top" align="left">58.83</td>
<td valign="top" align="left">62.52</td>
<td valign="top" align="left">91.45</td>
<td valign="top" align="left">76.94</td>
<td valign="top" align="left">75.28</td>
<td valign="top" align="center">68.79</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="4">Valid</td>
<td valign="top" align="left">NN4</td>
<td valign="top" align="left">54.57</td>
<td valign="top" align="left">62.77</td>
<td valign="top" align="left">40.85</td>
<td valign="top" align="left">29.58</td>
<td valign="top" align="left">39.36</td>
<td valign="top" align="left">42.77</td>
<td valign="top" align="left">58.24</td>
<td valign="top" align="left">59.52</td>
<td valign="top" align="left"><bold>70.84</bold></td>
<td valign="top" align="left">60.06</td>
<td valign="top" align="center">51.99</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN8</td>
<td valign="top" align="left"><bold>62.08</bold></td>
<td valign="top" align="left">65.54</td>
<td valign="top" align="left">39.76</td>
<td valign="top" align="left"><bold>33.92</bold></td>
<td valign="top" align="left">38.53</td>
<td valign="top" align="left">43.36</td>
<td valign="top" align="left"><bold>62.34</bold></td>
<td valign="top" align="left">56.62</td>
<td valign="top" align="left">70.74</td>
<td valign="top" align="left">59.06</td>
<td valign="top" align="center">53.30</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN4</td>
<td valign="top" align="left">59.51</td>
<td valign="top" align="left">68.32</td>
<td valign="top" align="left"><bold>49.11</bold></td>
<td valign="top" align="left">32.68</td>
<td valign="top" align="left">49.53</td>
<td valign="top" align="left"><bold>52.25</bold></td>
<td valign="top" align="left">51.41</td>
<td valign="top" align="left"><bold>73.62</bold></td>
<td valign="top" align="left">67.64</td>
<td valign="top" align="left"><bold>62.86</bold></td>
<td valign="top" align="center"><bold>56.83</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN8</td>
<td valign="top" align="left">60.43</td>
<td valign="top" align="left"><bold>66.63</bold></td>
<td valign="top" align="left">47.12</td>
<td valign="top" align="left">23.37</td>
<td valign="top" align="left"><bold>53.79</bold></td>
<td valign="top" align="left">44.82</td>
<td valign="top" align="left">47.90</td>
<td valign="top" align="left">73.24</td>
<td valign="top" align="left">65.33</td>
<td valign="top" align="left">59.86</td>
<td valign="top" align="center">54.37</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="4">Test</td>
<td valign="top" align="left">NN4</td>
<td valign="top" align="left">57.30</td>
<td valign="top" align="left">64.70</td>
<td valign="top" align="left">40.60</td>
<td valign="top" align="left">28.10</td>
<td valign="top" align="left">38.30</td>
<td valign="top" align="left">42.10</td>
<td valign="top" align="left">57.30</td>
<td valign="top" align="left">58.20</td>
<td valign="top" align="left"><bold>69.20</bold></td>
<td valign="top" align="left">57.90</td>
<td valign="top" align="center">51.37</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">NN8</td>
<td valign="top" align="left"><bold>62.60</bold></td>
<td valign="top" align="left">63.10</td>
<td valign="top" align="left">40.70</td>
<td valign="top" align="left">33.20</td>
<td valign="top" align="left">39.10</td>
<td valign="top" align="left">44.80</td>
<td valign="top" align="left"><bold>61.60</bold></td>
<td valign="top" align="left">55.30</td>
<td valign="top" align="left">69.00</td>
<td valign="top" align="left">55.80</td>
<td valign="top" align="center">52.52</td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN4</td>
<td valign="top" align="left">61.00</td>
<td valign="top" align="left">66.90</td>
<td valign="top" align="left">48.40</td>
<td valign="top" align="left"><bold>34.40</bold></td>
<td valign="top" align="left">48.10</td>
<td valign="top" align="left"><bold>54.70</bold></td>
<td valign="top" align="left">54.50</td>
<td valign="top" align="left">72.50</td>
<td valign="top" align="left">67.30</td>
<td valign="top" align="left"><bold>63.30</bold></td>
<td valign="top" align="center"><bold>57.11</bold></td>
</tr>
<tr>
<td/>
<td valign="top" align="left">t-NN8</td>
<td valign="top" align="left"><bold>62.60</bold></td>
<td valign="top" align="left"><bold>67.30</bold></td>
<td valign="top" align="left"><bold>48.70</bold></td>
<td valign="top" align="left">25.20</td>
<td valign="top" align="left"><bold>52.50</bold></td>
<td valign="top" align="left">46.70</td>
<td valign="top" align="left">50.20</td>
<td valign="top" align="left"><bold>72.90</bold></td>
<td valign="top" align="left">63.40</td>
<td valign="top" align="left">60.10</td>
<td valign="top" align="center">54.96</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The t-NN architectures with t-cross entropy loss produce the highest overall validation and test accuracy, the traditional metrics to indicate generalization ability. The bolded values indicate the highest accuracy for the class and dataset.</p>
</table-wrap-foot>
</table-wrap>
<p>There are several key takeaways from the numerical results. First, the depth of the network did not significantly change performance in this experiment. We state this observation cautiously. We observe this behavior for a certain set of fixed hyperparameters (e.g., step size <italic>h</italic>, learning rate, regularization parameter &#x003BB;,...). The interaction of the hyperparameters and performance is complex and a complete ablation study is outside of the scope of this paper. A second takeaway is that the t-NN trained with the tubal loss generalizes better than the NN networks and better than t-NNs with cross entropy loss (not shown for simplicity; see <xref ref-type="fig" rid="F10">Figure 10</xref> for details). This behavior is especially apparent when looking at the test accuracy in <xref ref-type="table" rid="T4">Table 4</xref>. The t-NN with four Hamiltonian layers and t-loss performs well overall, obtaining almost 5% better overall test accuracy. In comparison, the matrix NN quickly overfits the training data and thus does not generalize as well. In terms of the test accuracy per class, the t-NN architectures achieve the best performance in all but two classes.</p>
<p>To look into the performance further, we examine the extracted features of Hamiltonian NNs and t-NNs in <xref ref-type="fig" rid="F11">Figure 11</xref>. We see that the NN and t-NN features share similarities. Both features gradually remove the structure of the original image at similar rates. The t-NN architecture achieves this pattern with significantly fewer network weights (over 30 times fewer). The noisy artifacts differ between the two architectures. In particular, we see that the t-NN layers produce blockier artifacts because of the structured &#x022C6;<sub><italic>M</italic></sub>-operation.</p>
<fig id="F11" position="float">
<label>Figure 11</label>
<caption><p>Features from the trained 4-layered Hamiltonian networks of both the matrix and tensor parameterized cases for four different training images (top-to-bottom: dog, horse, truck, ship). For the t-NN, we use the better-performing network with t-cross entropy loss. Here, Layer 0 shows the separated color channels of the original images.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0011.tif"/>
</fig>
<p>In the last numerical study in <xref ref-type="fig" rid="F12">Figure 12</xref>, we explore how quickly we can train Hamiltonian t-NNs compared to NNs. In addition to an order of magnitude fewer weights, training t-NNs takes less time than training NNs. As we increase the depth of the networks, we see that each NN epoch takes approximately 1.75 times longer to complete. This performance could potentially be further improved if we optimized the &#x022C6;<sub><italic>M</italic></sub>-product, e.g., using fast transforms instead of matrix multiplication.</p>
<fig id="F12" position="float">
<label>Figure 12</label>
<caption><p><bold>(Left)</bold> Average time per epoch to train a Hamiltonian network for a fixed batch size. We ran each depth for five epochs and the maximum standard deviation of time was on the order of 10<sup>&#x02212;2</sup> relative to the average time. <bold>(Right)</bold> Time ratio average <italic>r</italic> &#x0003D; NN epoch/t-NN epoch. As the depth of the network grows, the time per epoch for NNs takes almost 1.75 times longer than for t-NNs.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0012.tif"/>
</fig>
</sec>
<sec>
<title>6.5 CIFAR-100</title>
<p>The CIFAR100 dataset (Krizhevsky and Hinton, <xref ref-type="bibr" rid="B27">2009</xref>) is composed of 32 &#x000D7; 32 &#x000D7; 3 RGB natural images belonging to 100 classes. We train on 40,000 images and reserve 10,000 for validation. We report test accuracy on 10,000 images used for neither training nor validation. We use the same setup and training parameters as the CIFAR10 experiment (Section 6.4). For all experiments, we use Hamiltonian networks with a depth of <italic>d</italic> &#x0003D; 16, a step size of <italic>h</italic> &#x0003D; 0.25, and a regularization parameter &#x003BB; &#x0003D; 10<sup>&#x02212;2</sup>. We report the accuracy results in <xref ref-type="fig" rid="F13">Figure 13</xref>.</p>
<fig id="F13" position="float">
<label>Figure 13</label>
<caption><p><bold>(Left)</bold> Convergence of the accuracy for the CIFAR-100 experiment. To delineate the performance on the validation data, we show the first 100 epochs out of 500 total. The t-NN with t-loss converges to the highest validation accuracy and avoids the generalization gap longest out of all presented networks. <bold>(Right)</bold> Final accuracy for the training, validation, and test data. We report the results using the networks that produced the highest validation accuracy during training.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1363978-g0013.tif"/>
</fig>
<p>Observing the results in <xref ref-type="fig" rid="F13">Figure 13</xref>, the conclusions are the same as in the CIFAR-10 experiment. Specifically, training with the t-NN and t-loss produces the best test accuracy, about a 4% improvement from the comparable NN network. This demonstrates that the benefits of dense tensor operations over dense matrix operations can be realized for more challenging classification problems and motivates further development of these tools to improve state-of-the-art convolutional neural networks and other architectures.</p></sec>
</sec>
<sec sec-type="conclusions" id="s7">
<title>7 Conclusion</title>
<p>We presented tensor neural networks (t-NNs) as a new approach to parameterize fully-connected neural networks. We operate using the &#x022C6;<sub><italic>M</italic></sub>-product which can reduce the number of network weights by an order of magnitude while maintaining the same expressiveness. We introduced tubal loss functions that make the t-NN architecture algebraically consistent. Because the &#x022C6;<sub><italic>M</italic></sub>-framework gives rise to a tensor algebra that preserves matrix properties, we extended the notion of stable neural networks to t-NNs, which enable the development of deeper, more expressive networks. Through numerical experiments on benchmark image classification tasks, we demonstrated that t-NNs offer a more efficient parameterization and, when trained with tubal loss functions, can generalize better to unseen data.</p>
<p>Our work opens the door to several natural extensions. First, we note that while this paper focused on imaging benchmark problems in machine learning, the &#x022C6;<sub><italic>M</italic></sub>-framework can be applied to many data sets, including dynamic graphs (Malik et al., <xref ref-type="bibr" rid="B35">2021</xref>), longitudinal omics data (Mor et al., <xref ref-type="bibr" rid="B36">2022</xref>), and functional magnetic resonance imaging (fMRI) (Keegan et al., <xref ref-type="bibr" rid="B19">2022</xref>). Second, we could use tensor parameterizations to improve convolutional neural networks (CNNs), just as we used t-NN layers to improve fully-connected networks. CNNs are state-of-the-art for image classification and rely on convolution operations. The &#x022C6;<sub><italic>M</italic></sub>-product is, in some sense, a convolution based on the transformation <bold>M</bold>; in fact, when <bold>M</bold> is the discrete Fourier transform, the result is a circulant convolution. A t-CNN could offer more efficient parameterization and a greater range of convolutional features that could increase the expressibility of the network. Third, we could extend the use of tubal loss functions to any network architecture. Tubal loss functions offer more stringent requirements to fitting data which can mitigate overfitting. Additionally, tubal loss functions foster a new level of flexibility to evaluate performance, such as various norms to transform tubal probabilities into scalars and new measures of accuracy per frontal slice. Fourth, we can consider learning the operator <bold>M</bold> based on the data or allowing the operator to evolve with the layers. Lastly, we can explore methods to improve t-NN efficiency on CPUs and GPUs by exploiting the parallelism of the &#x022C6;<sub><italic>M</italic></sub>-products.</p></sec>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found at: <ext-link ext-link-type="uri" xlink:href="https://pytorch.org/vision/main/datasets.html">https://pytorch.org/vision/main/datasets.html</ext-link> and <ext-link ext-link-type="uri" xlink:href="https://github.com/elizabethnewman/tnn">https://github.com/elizabethnewman/tnn</ext-link>.</p></sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>EN: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing. LH: Conceptualization, Data curation, Funding acquisition, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing. HA: Conceptualization, Data curation, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing. MK: Funding acquisition, Writing&#x02014;original draft, Writing&#x02014;review &#x00026; editing.</p></sec>
</body>
<back>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was partially funded by the Exploratory Science program at IBM. EN&#x00027;s work was partially supported by the National Science Foundation (NSF) under grant [DMS-2309751]. HA&#x00027;s work was partially funded by an IBM Faculty Award. Any opinions, findings, conclusions, or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the National Science Foundation. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</p>
</sec>
<ack><p>We thank Dr. Eldad Haber and Dr. Lars Ruthotto for providing additional insight on their work which helped us generalize their ideas to our high-dimensional framework.</p>
</ack>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>LH is employed by IBM. The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ascher</surname> <given-names>U. M.</given-names></name></person-group> (<year>2010</year>). <source>Numerical Methods for Evolutionary Differential Equations</source>. <publisher-loc>Philadelphia, PA</publisher-loc>: <publisher-name>SIAM</publisher-name>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Simard</surname> <given-names>P.</given-names></name> <name><surname>Frasconi</surname> <given-names>P.</given-names></name></person-group> (<year>1994</year>). <article-title>Learning long-term dependencies with gradient descent is difficult</article-title>. <source>IEEE Trans. Neural Netw</source>. <volume>5</volume>, <fpage>157</fpage>&#x02013;<lpage>166</lpage>. <pub-id pub-id-type="doi">10.1109/72.279181</pub-id><pub-id pub-id-type="pmid">18267787</pub-id></citation></ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bottou</surname> <given-names>L.</given-names></name> <name><surname>Curtis</surname> <given-names>F. E.</given-names></name> <name><surname>Nocedal</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>Optimization methods for large-scale machine learning</article-title>. <source>SIAM Rev.</source> <volume>60</volume>, <fpage>223</fpage>&#x02013;<lpage>311</lpage>. <pub-id pub-id-type="doi">10.1137/16M1080173</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Brooks</surname> <given-names>S.</given-names></name> <name><surname>Gelman</surname> <given-names>A.</given-names></name> <name><surname>Jones</surname> <given-names>G. L.</given-names></name> <name><surname>Meng</surname> <given-names>X.-L.</given-names></name></person-group> (<year>2011</year>). <source>Handbook of Markov Chain Monte Carlo</source>. <publisher-loc>Boca Raton, FL</publisher-loc>: <publisher-name>Chapman and Hall/CRC</publisher-name>. <pub-id pub-id-type="doi">10.1201/b10905</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cao</surname> <given-names>X.</given-names></name> <name><surname>Rabusseau</surname> <given-names>G.</given-names></name> <name><surname>Pineau</surname> <given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Tensor regression networks with various low-rank tensor approximations</article-title>. <source>arXiv</source> [Preprint]. arxiv:1712.09520. <pub-id pub-id-type="doi">10.48550/arxiv.1712.09520</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Carroll</surname> <given-names>J. D.</given-names></name> <name><surname>Chang</surname> <given-names>J.-J.</given-names></name></person-group> (<year>1970</year>). <article-title>Analysis of individual differences in multidimensional scaling via an n-way generalization of &#x0201C;Eckart-Young&#x0201D; decomposition</article-title>. <source>Psychometrika</source> <volume>35</volume>, <fpage>283</fpage>&#x02013;<lpage>319</lpage>. <pub-id pub-id-type="doi">10.1007/BF02310791</pub-id></citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chattopadhyay</surname> <given-names>A.</given-names></name> <name><surname>Hassanzadeh</surname> <given-names>P.</given-names></name> <name><surname>Pasha</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Predicting clustered weather patterns: a test case for applications of convolutional neural networks to spatio-temporal climate data</article-title>. <source>Sci. Rep</source>. <volume>10</volume>:<fpage>1317</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-020-57897-9</pub-id><pub-id pub-id-type="pmid">31992743</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chien</surname> <given-names>J.-T.</given-names></name> <name><surname>Bao</surname> <given-names>Y.-T.</given-names></name></person-group> (<year>2018</year>). <article-title>Tensor-factorized neural networks</article-title>. <source>IEEE Trans. Neural Netw</source>. <volume>29</volume>, <fpage>1998</fpage>&#x02013;<lpage>2011</lpage>. <pub-id pub-id-type="doi">10.1109/TNNLS.2017.2690379</pub-id><pub-id pub-id-type="pmid">28436897</pub-id></citation></ref>
<ref id="B9">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cichocki</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>N.</given-names></name> <name><surname>Oseledets</surname> <given-names>I.</given-names></name> <name><surname>Phan</surname> <given-names>A.-H.</given-names></name> <name><surname>Zhao</surname> <given-names>Q.</given-names></name> <name><surname>Mandic</surname> <given-names>D. P.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Tensor networks for dimensionality reduction and large-scale optimization: part 1 low-rank tensor decompositions</article-title>. <source>Found. Trends Mach. Learn</source>. <volume>9</volume>, <fpage>249</fpage>&#x02013;<lpage>429</lpage>. <pub-id pub-id-type="doi">10.1561/2200000059</pub-id></citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>de Lathauwer</surname> <given-names>L.</given-names></name> <name><surname>de Moor</surname> <given-names>B.</given-names></name> <name><surname>Vandewalle</surname> <given-names>J.</given-names></name></person-group> (<year>2000</year>). <article-title>A multilinear singular value decomposition</article-title>. <source>SIAM J. Matrix Anal. Appl</source>. <volume>21</volume>, <fpage>1253</fpage>&#x02013;<lpage>1278</lpage>. <pub-id pub-id-type="doi">10.1137/S0895479896305696</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Denil</surname> <given-names>M.</given-names></name> <name><surname>Shakibi</surname> <given-names>B.</given-names></name> <name><surname>Dinh</surname> <given-names>L.</given-names></name> <name><surname>Ranzato</surname> <given-names>M.</given-names></name> <name><surname>de Frietas</surname> <given-names>N.</given-names></name></person-group> (<year>2013</year>). <article-title>&#x0201C;Predicting parameters in deep learning,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems 26</source>, eds. C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani, and K. Q. Weinberger (Curran Associates, Inc.), <fpage>2148</fpage>&#x02013;<lpage>2156</lpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://papers.nips.cc/paper/5025-predicting-parameters-in-deep-learning.pdf">http://papers.nips.cc/paper/5025-predicting-parameters-in-deep-learning.pdf</ext-link> (accessed December 18, 2023).</citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ee</surname> <given-names>W.</given-names></name></person-group> (<year>2017</year>). <article-title>A proposal on machine learning via dynamical systems</article-title>. <source>Comm. Math. Stat</source>. <volume>5</volume>, <fpage>1</fpage>&#x02013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1007/s40304-017-0103-z</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Haber</surname> <given-names>E.</given-names></name> <name><surname>Ruthotto</surname> <given-names>L.</given-names></name></person-group> (<year>2017</year>). <article-title>Stable architectures for deep neural networks</article-title>. <source>Inverse Probl</source>. <volume>34</volume>:<fpage>1</fpage>. <pub-id pub-id-type="doi">10.1088/1361-6420/aa9a90</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Haber</surname> <given-names>E.</given-names></name> <name><surname>Ruthotto</surname> <given-names>L.</given-names></name> <name><surname>Holtham</surname> <given-names>E.</given-names></name> <name><surname>Jun</surname> <given-names>S.-H.</given-names></name></person-group> (<year>2018</year>). <article-title>Learning across scales&#x02013;multiscale methods for convolution neural networks</article-title>. <source>Proc. AAAI Conf. Artif. Intell</source>. <volume>32</volume>, <fpage>3142</fpage>&#x02013;<lpage>3148</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v32i1.11680</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hao</surname> <given-names>N.</given-names></name> <name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Braman</surname> <given-names>K.</given-names></name> <name><surname>Hoover</surname> <given-names>R. C.</given-names></name></person-group> (<year>2013</year>). <article-title>Facial recognition using tensor-tensor decompositions</article-title>. <source>SIAM J. Imaging Sci</source>. <volume>6</volume>, <fpage>437</fpage>&#x02013;<lpage>463</lpage>. <pub-id pub-id-type="doi">10.1137/110842570</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Harshman</surname> <given-names>R. A.</given-names></name></person-group> (<year>1970</year>). <article-title>&#x0201C;Foundations of the parafac procedure: models and conditions for an &#x0201C;explanatory&#x0201D; multimodal factor analysis,&#x0201D;</article-title> in <source>UCLA Working Papers in Phonetics</source>, <volume>16</volume>, <fpage>1</fpage>&#x02013;<lpage>84</lpage>. (University Microfilms, Ann Arbor, Michigan, No. 10,085).</citation>
</ref>
<ref id="B17">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>K.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Deep residual learning for image recognition,&#x0201D;</article-title> in <source>2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>770</fpage>&#x02013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jagtap</surname> <given-names>A. D.</given-names></name> <name><surname>Shin</surname> <given-names>Y.</given-names></name> <name><surname>Kawaguchi</surname> <given-names>K.</given-names></name> <name><surname>Karniadakis</surname> <given-names>G. E.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep kronecker neural networks: a general framework for neural networks with adaptive activation functions</article-title>. <source>Neurocomputing</source> <volume>468</volume>, <fpage>165</fpage>&#x02013;<lpage>180</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.10.036</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Keegan</surname> <given-names>K.</given-names></name> <name><surname>Vishwanath</surname> <given-names>T.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>A tensor SVD-based classification algorithm applied to fmri data</article-title>. <source>SIAM Undergrad. Res. Online</source> <volume>15</volume>, <fpage>270</fpage>&#x02013;<lpage>294</lpage>. <pub-id pub-id-type="doi">10.1137/21S1456522</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kernfeld</surname> <given-names>E.</given-names></name> <name><surname>Kilmer</surname> <given-names>M.</given-names></name> <name><surname>Aeron</surname> <given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>Tensor-tensor products with invertible linear transforms</article-title>. <source>Linear Algebra Appl</source>. <volume>485</volume>, <fpage>545</fpage>&#x02013;<lpage>570</lpage>. <pub-id pub-id-type="doi">10.1016/j.laa.2015.07.021</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Braman</surname> <given-names>K.</given-names></name> <name><surname>Hao</surname> <given-names>N.</given-names></name> <name><surname>Hoover</surname> <given-names>R. C.</given-names></name></person-group> (<year>2013</year>). <article-title>Third-order tensors as operators on matrices: a theoretical and computational framework with applications in imaging</article-title>. <source>SIAM J. Matrix Anal. Appl</source>. <volume>34</volume>, <fpage>148</fpage>&#x02013;<lpage>172</lpage>. <pub-id pub-id-type="doi">10.1137/110837711</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Horesh</surname> <given-names>L.</given-names></name> <name><surname>Avron</surname> <given-names>H.</given-names></name> <name><surname>Newman</surname> <given-names>E.</given-names></name></person-group> (<year>2021</year>). <article-title>Tensor-tensor algebra for optimal representation and compression of multiway data</article-title>. <source>Proc. Natl. Acad. Sci. USA</source>. <volume>118</volume>:<fpage>e2015851118</fpage>. <pub-id pub-id-type="doi">10.1073/pnas.2015851118</pub-id><pub-id pub-id-type="pmid">34234014</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Martin</surname> <given-names>C. D.</given-names></name></person-group> (<year>2011</year>). <article-title>Factorization strategies for third-order tensors</article-title>. <source>Linear Algebra Appl</source>. <volume>435</volume>, <fpage>641</fpage>&#x02013;<lpage>658</lpage>. <pub-id pub-id-type="doi">10.1016/j.laa.2010.09.020</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kingma</surname> <given-names>D. P.</given-names></name> <name><surname>Ba</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Adam: a method for stochastic optimization,&#x0201D;</article-title> in <source>3rd International Conference on Learning Representations, ICLR 2015, May 7-9, 2015, Conference Track Proceedings</source>, eds. Y. Bengio, and Y. LeCun (San Diego, CA).</citation>
</ref>
<ref id="B25">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kolda</surname> <given-names>T. G.</given-names></name> <name><surname>Bader</surname> <given-names>B. W.</given-names></name></person-group> (<year>2009</year>). <article-title>Tensor decompositions and applications</article-title>. <source>SIAM Rev</source>. <volume>51</volume>, <fpage>455</fpage>&#x02013;<lpage>500</lpage>. <pub-id pub-id-type="doi">10.1137/07070111X</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kossaifi</surname> <given-names>J.</given-names></name> <name><surname>Lipton</surname> <given-names>Z. C.</given-names></name> <name><surname>Kolbeinsson</surname> <given-names>A.</given-names></name> <name><surname>Khanna</surname> <given-names>A.</given-names></name> <name><surname>Furlanello</surname> <given-names>T.</given-names></name> <name><surname>Anandkumar</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Tensor regression networks</article-title>. <source>J. Mach. Learn. Res</source>. <volume>21</volume>, <fpage>1</fpage>&#x02013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1707.08308</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Krizhevsky</surname> <given-names>A.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name></person-group> (<year>2009</year>). <source>Learning multiple layers of features from tiny images</source>. Technical report, University of Toronto. Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.cs.toronto.edu/&#x0007E;kriz/learning-features-2009-TR.pdf">https://www.cs.toronto.edu/&#x0007E;kriz/learning-features-2009-TR.pdf</ext-link></citation>
</ref>
<ref id="B28">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Krizhevsky</surname> <given-names>A.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name> <name><surname>Hinton</surname> <given-names>G. E.</given-names></name></person-group> (<year>2012</year>). <article-title>&#x0201C;ImageNet classification with deep convolutional neural networks,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Vol. 25</source>, eds. F. Pereira, C. J. Burges, L. Bottou, and K. Q. Weinberger (Red Hook, NY: Curran Associates, Inc.), 1097&#x02013;1105. Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf</ext-link></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kullback</surname> <given-names>S.</given-names></name> <name><surname>Leibler</surname> <given-names>R. A.</given-names></name></person-group> (<year>1951</year>). <article-title>On information and sufficiency</article-title>. <source>Ann. Math. Stat</source>. <volume>22</volume>, <fpage>79</fpage>&#x02013;<lpage>86</lpage>. <pub-id pub-id-type="doi">10.1214/aoms/1177729694</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Cortes</surname> <given-names>C.</given-names></name> <name><surname>Burges</surname> <given-names>C. J. C.</given-names></name></person-group> (<year>1998</year>). <source>The MNIST Database of Handwritten Digits.</source> New York, NY. Available online at: <ext-link ext-link-type="uri" xlink:href="http://yann.lecun.com/exdb/mnist/">http://yann.lecun.com/exdb/mnist/</ext-link></citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>LeCun</surname> <given-names>Y.</given-names></name> <name><surname>Denker</surname> <given-names>J.</given-names></name> <name><surname>Solla</surname> <given-names>S.</given-names></name></person-group> (<year>1989</year>). <article-title>&#x0201C;Optimal brain damage,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems, Volume 2</source>, ed. D. Touretzky (Cambridge, MA: Morgan-Kaufmann).</citation>
</ref>
<ref id="B32">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Yu</surname> <given-names>Z.</given-names></name> <name><surname>Wu</surname> <given-names>F.</given-names></name> <name><surname>Luo</surname> <given-names>W.</given-names></name> <name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Yuan</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>The tensor-based feature analysis of spatiotemporal field data with heterogeneity</article-title>. <source>Earth Space Sci</source>. <volume>7</volume>:<fpage>e2019EA001037</fpage>. <pub-id pub-id-type="doi">10.1029/2019EA001037</pub-id></citation>
</ref>
<ref id="B33">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lund</surname> <given-names>K.</given-names></name></person-group> (<year>2020</year>). <article-title>The tensor t-function: a definition for functions of third-order tensors</article-title>. <source>Numer. Linear Algebra Appl</source>. <volume>27</volume>:<fpage>e2288</fpage>. <pub-id pub-id-type="doi">10.1002/nla.2288</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>A.</given-names></name> <name><surname>Molitor</surname> <given-names>D.</given-names></name></person-group> (<year>2022</year>). <article-title>Randomized Kaczmarz for tensor linear systems</article-title>. <source>BIT Numer. Math</source>. <volume>62</volume>, <fpage>171</fpage>&#x02013;<lpage>194</lpage>. <pub-id pub-id-type="doi">10.1007/s10543-021-00877-w</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Malik</surname> <given-names>O. A.</given-names></name> <name><surname>Ubaru</surname> <given-names>S.</given-names></name> <name><surname>Horesh</surname> <given-names>L.</given-names></name> <name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Avron</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Dynamic graph convolutional networks using the tensor M-product,&#x0201D;</article-title> in <source>Proceedings of the 2021 SIAM International Conference on Data Mining (SDM)</source> (<publisher-loc>Philadelphia, PA</publisher-loc>), <fpage>729</fpage>&#x02013;<lpage>737</lpage>. <pub-id pub-id-type="doi">10.1137/1.9781611976700.82</pub-id></citation>
</ref>
<ref id="B36">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mor</surname> <given-names>U.</given-names></name> <name><surname>Cohen</surname> <given-names>Y.</given-names></name> <name><surname>Vald&#x000E9;s-Mas</surname> <given-names>R.</given-names></name> <name><surname>Kviatcovsky</surname> <given-names>D.</given-names></name> <name><surname>Elinav</surname> <given-names>E.</given-names></name> <name><surname>Avron</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Dimensionality reduction of longitudinal omics data using modern tensor factorizations</article-title>. <source>PLoS Comput. Biol</source>. <volume>18</volume>, <fpage>1</fpage>&#x02013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1010212</pub-id><pub-id pub-id-type="pmid">35839259</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Newman</surname> <given-names>E.</given-names></name></person-group> (<year>2019</year>). <source>A Step in the Right Dimension: Tensor Algebra and Applications</source> [PhD thesis]. Medford, MA: Tufts University.</citation>
</ref>
<ref id="B38">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Newman</surname> <given-names>E.</given-names></name> <name><surname>Kilmer</surname> <given-names>M.</given-names></name> <name><surname>Horesh</surname> <given-names>L.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Image classification using local tensor singular value decompositions,&#x0201D;</article-title> in <source>2017 IEEE 7th International Workshop on Computational Advances in Multi-Sensor Adaptive Processing (CAMSAP)</source> (<publisher-loc>Curacao</publisher-loc>: <publisher-name>IEEE</publisher-name>). <pub-id pub-id-type="doi">10.1109/CAMSAP.2017.8313137</pub-id></citation></ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Newman</surname> <given-names>E.</given-names></name> <name><surname>Kilmer</surname> <given-names>M. E.</given-names></name></person-group> (<year>2020</year>). <article-title>Nonnegative tensor patch dictionary approaches for image compression and deblurring applications</article-title>. <source>SIAM J. Imaging Sci</source>. <volume>13</volume>, <fpage>1084</fpage>&#x02013;<lpage>1112</lpage>. <pub-id pub-id-type="doi">10.1137/19M1297026</pub-id></citation>
</ref>
<ref id="B40">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Nielsen</surname> <given-names>M. A.</given-names></name></person-group> (<year>2018</year>). <source>Neural Networks and Deep Learning</source>. <publisher-loc>San Francisco, CA</publisher-loc>: <publisher-name>Determination Press</publisher-name>.</citation>
</ref>
<ref id="B41">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Novikov</surname> <given-names>A.</given-names></name> <name><surname>Podoprikhin</surname> <given-names>D.</given-names></name> <name><surname>Osokin</surname> <given-names>A.</given-names></name> <name><surname>Vetrov</surname> <given-names>D.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Tensorizing neural networks,&#x0201D;</article-title> in <source>Advances in Neural Information Processing Systems 28</source>, eds. C. Cortes, N. D. Lawrence, D. D. Lee, M. Sugiyama, and R. Garnett (Curran Associates, Inc.), <fpage>442</fpage>&#x02013;<lpage>450</lpage>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://papers.nips.cc/paper/5787-tensorizing-neural-networks.pdf">http://papers.nips.cc/paper/5787-tensorizing-neural-networks.pdf</ext-link> (accessed December 20, 2023).</citation>
</ref>
<ref id="B42">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Omberg</surname> <given-names>L.</given-names></name> <name><surname>Golub</surname> <given-names>G. H.</given-names></name> <name><surname>Alter</surname> <given-names>O.</given-names></name></person-group> (<year>2007</year>). <article-title>A tensor higher-order singular value decomposition for integrative analysis of dna microarray data from different studies</article-title>. <source>Proc. Nat. Acad. Sci</source>. <volume>104</volume>, <fpage>18371</fpage>&#x02013;<lpage>18376</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0709146104</pub-id><pub-id pub-id-type="pmid">18003902</pub-id></citation></ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oseledets</surname> <given-names>I. V.</given-names></name></person-group> (<year>2011</year>). <article-title>Tensor-train decomposition</article-title>. <source>SIAM J. Sci. Comput</source>. <volume>33</volume>, <fpage>2295</fpage>&#x02013;<lpage>2317</lpage>. <pub-id pub-id-type="doi">10.1137/090752286</pub-id></citation>
</ref>
<ref id="B44">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Paszke</surname> <given-names>A.</given-names></name> <name><surname>Gross</surname> <given-names>S.</given-names></name> <name><surname>Chintala</surname> <given-names>S.</given-names></name> <name><surname>Chanan</surname> <given-names>G.</given-names></name> <name><surname>Yang</surname> <given-names>E.</given-names></name> <name><surname>DeVito</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>&#x0201C;Automatic differentiation in pytorch,&#x0201D;</article-title> in <source>NIPS-W</source> (<publisher-loc>Long Beach, CA</publisher-loc>).</citation>
</ref>
<ref id="B45">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Petersen</surname> <given-names>K. B.</given-names></name> <name><surname>Pedersen</surname> <given-names>M. S.</given-names></name></person-group> (<year>2012</year>). <source>The Matrix Cookbook</source>. <publisher-loc>Lyngby</publisher-loc>: <publisher-name>Technical University of Denmark</publisher-name>.</citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ronneberger</surname> <given-names>O.</given-names></name> <name><surname>Fischer</surname> <given-names>P.</given-names></name> <name><surname>Brox</surname> <given-names>T.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;U-net: convolutional networks for biomedical image segmentation,&#x0201D;</article-title> in <source>Medical Image Computing and Computer-Assisted Intervention-MICCAI 2015</source>, eds. N. Navab, J. Hornegger, W. M. Wells, and A. F. Frangi (Cham: Springer International Publishing), <fpage>234</fpage>&#x02013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id></citation>
</ref>
<ref id="B47">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rumelhart</surname> <given-names>D. E.</given-names></name> <name><surname>Hinton</surname> <given-names>G. E.</given-names></name> <name><surname>Williams</surname> <given-names>R. J.</given-names></name></person-group> (<year>1986</year>). <article-title>Learning representations by back-propagating errors</article-title>. <source>Nature</source> <volume>323</volume>, <fpage>533</fpage>&#x02013;<lpage>536</lpage>. <pub-id pub-id-type="doi">10.1038/323533a0</pub-id></citation></ref>
<ref id="B48">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Shalev-Shwartz</surname> <given-names>S.</given-names></name> <name><surname>Shamir</surname> <given-names>O.</given-names></name> <name><surname>Shammah</surname> <given-names>S.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Failures of gradient-based deep learning,&#x0201D;</article-title> in <source>Proceedings of the 34th International Conference on Machine Learning</source>-<italic>Volume 70, ICML&#x00027;17</italic>, 3067&#x02013;3075. Available online at: <ext-link ext-link-type="uri" xlink:href="https://JMLR.org">https://JMLR.org</ext-link> (accessed December 12, 2023).<pub-id pub-id-type="pmid">37141866</pub-id></citation></ref>
<ref id="B49">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Skeel</surname> <given-names>R. D.</given-names></name></person-group> (<year>1993</year>). <article-title>Variable step size destabilizes the st&#x000F6;rmer/leapfrog/verlet method</article-title>. <source>BIT</source> <volume>33</volume>, <fpage>172</fpage>&#x02013;<lpage>175</lpage>. <pub-id pub-id-type="doi">10.1007/BF01990352</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Soltani</surname> <given-names>S.</given-names></name> <name><surname>Kilmer</surname> <given-names>M. E.</given-names></name> <name><surname>Hansen</surname> <given-names>P. C.</given-names></name></person-group> (<year>2016</year>). <article-title>A tensor-based dictionary learning approach to tomographic image reconstruction</article-title>. <source>BIT Numer. Math</source>. <volume>56</volume>, <fpage>1425</fpage>&#x02013;<lpage>1454</lpage>. <pub-id pub-id-type="doi">10.1007/s10543-016-0607-z</pub-id><pub-id pub-id-type="pmid">25779991</pub-id></citation></ref>
<ref id="B51">
<citation citation-type="web"><person-group person-group-type="author"><collab>TCA</collab></person-group> (<year>2023</year>). <source>Tufts community appeal</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://communityrelations.tufts.edu/engage-us/faculty-staff-students/tufts-community-appeal-tca">https://communityrelations.tufts.edu/engage-us/faculty-staff-students/tufts-community-appeal-tca</ext-link> (accessed December 20, 2023).</citation>
</ref>
<ref id="B52">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tucker</surname> <given-names>L. R.</given-names></name></person-group> (<year>1966</year>). <article-title>Some mathematical notes on three-mode factor analysis</article-title>. <source>Psychometrika</source> <volume>31</volume>, <fpage>279</fpage>&#x02013;<lpage>311</lpage>. <pub-id pub-id-type="doi">10.1007/BF02289464</pub-id><pub-id pub-id-type="pmid">5221127</pub-id></citation></ref>
<ref id="B53">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vasilescu</surname> <given-names>M. A. O.</given-names></name> <name><surname>Terzopoulos</surname> <given-names>D.</given-names></name></person-group> (<year>2002</year>). <article-title>&#x0201C;Multilinear analysis of image ensembles: tensorfaces,&#x0201D;</article-title> in <source>Computer Vision</source> &#x02013; <italic>ECCV 2002</italic>, eds. A. Heyden, G. Sparr, M. Nielsen, and P. Johansen (Berlin: Springer Berlin Heidelberg), <fpage>447</fpage>&#x02013;<lpage>460</lpage>. <pub-id pub-id-type="doi">10.1007/3-540-47969-4_30</pub-id></citation>
</ref>
<ref id="B54">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>M.</given-names></name> <name><surname>Pan</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Cichocki</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <source>Tensor networks meet neural networks: A survey and future perspectives</source>. arXiv, Cornell University, Ithaca, NY.</citation>
</ref>
<ref id="B55">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Ely</surname> <given-names>G.</given-names></name> <name><surname>Aeron</surname> <given-names>S.</given-names></name> <name><surname>Hao</surname> <given-names>N.</given-names></name> <name><surname>Kilmer</surname> <given-names>M.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Novel methods for multilinear data completion and denoising based on tensor-SVD,&#x0201D;</article-title> in <source>2014 IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Columbus, OH</publisher-loc>: <publisher-name>IEEE</publisher-name>). <pub-id pub-id-type="doi">10.1109/CVPR.2014.485</pub-id></citation>
</ref>
</ref-list>
</back>
</article> 
