<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comms. Net</journal-id>
<journal-title>Frontiers in Communications and Networks</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comms. Net</abbrev-journal-title>
<issn pub-type="epub">2673-530X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">734402</article-id>
<article-id pub-id-type="doi">10.3389/frcmn.2021.734402</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Communications and Networks</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Federated Learning for Audio Semantic Communication</article-title>
<alt-title alt-title-type="left-running-head">Tong et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Federated Learning for Audio Semantic Communication</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Tong</surname>
<given-names>Haonan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1103400/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Zhaohui</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/980863/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Sihua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1422989/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Ye</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1458262/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Semiari</surname>
<given-names>Omid</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1394176/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Saad</surname>
<given-names>Walid</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1458257/overview"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Yin</surname>
<given-names>Changchuan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1427127/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Beijing University of Posts and Telecommunications (BUPT), <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>University College London, <addr-line>London</addr-line>, <country>United&#x20;Kingdom</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Virginia Tech, <addr-line>Blacksburg</addr-line>, <addr-line>VA</addr-line>, <country>United&#x20;States</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>University of Colorado Colorado Springs, <addr-line>Colorado Springs</addr-line>, <addr-line>CO</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/968022/overview">Yuanming Shi</ext-link>, ShanghaiTech University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1002987/overview">Ahmed Imteaj</ext-link>, Florida International University, United&#x20;States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1035429/overview">Jiawen Kang</ext-link>, Nanyang Technological University, Singapore</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Changchuan Yin, <email>ccyin@bupt.edu.cn</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Data Science for Communications, a section of the journal Frontiers in Communications and Networks</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>10</day>
<month>09</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>2</volume>
<elocation-id>734402</elocation-id>
<history>
<date date-type="received">
<day>01</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>08</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Tong, Yang, Wang, Hu, Semiari, Saad and Yin.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Tong, Yang, Wang, Hu, Semiari, Saad and Yin</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>In this paper, the problem of audio semantic communication over wireless networks is investigated. In the considered model, wireless edge devices transmit large-sized audio data to a server using semantic communication techniques. The techniques allow devices to only transmit audio semantic information that captures the contextual features of audio signals. To extract the semantic information from audio signals, a wave to vector (wav2vec) architecture based autoencoder is proposed, which consists of convolutional neural networks (CNNs). The proposed autoencoder enables high-accuracy audio transmission with small amounts of data. To further improve the accuracy of semantic information extraction, federated learning (FL) is implemented over multiple devices and a server. Simulation results show that the proposed algorithm can converge effectively and can reduce the mean squared error (MSE) of audio transmission by nearly 100 times, compared to a traditional coding scheme.</p>
</abstract>
<kwd-group>
<kwd>federated learning</kwd>
<kwd>audio communication</kwd>
<kwd>semantic communication</kwd>
<kwd>autoencoder</kwd>
<kwd>wireless network</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Future wireless networks require high data rates and massive connectivity for emerging applications such as the Internet of Things (IoT) (<xref ref-type="bibr" rid="B31">Saad et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B22">Lee et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B15">Hu et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B1">Al-Garadi et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B16">Huang et&#x20;al., 2021</xref>). In particular, in human-computer interaction scenarios, humans may simultaneously control multiple IoT devices using speech, thus making audio communication pervasive in wireless local area networks such as smart homes. However, due to bandwidth constraints, the wireless network in a smart home may not be able to support broad and prolonged wireless audio communication. This, in turn, motivates the development of semantic communication techniques that allow devices to transmit only semantic information. Semantic communication aims at minimizing the difference between the meanings of the transmitted messages and those of the recovered messages, rather than the exact recovery of the transmitted symbols. The advantage of such an approach is that semantic communication transmits smaller amounts of data than traditional communication techniques. However, despite recent interest in semantic communications (<xref ref-type="bibr" rid="B13">Guler et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B35">Shi et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B40">Xie et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B37">Uysal et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B39">Xie and Qin, 2021</xref>), there is still a lack of reliable encoder and decoder models for audio semantic communication (ASC).</p>
<p>Existing works in <xref ref-type="bibr" rid="B33">Shannon (1948)</xref>, <xref ref-type="bibr" rid="B3">Bao et&#x20;al. (2011)</xref>, <xref ref-type="bibr" rid="B13">Guler et&#x20;al. (2018)</xref>, <xref ref-type="bibr" rid="B35">Shi et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B37">Uysal et&#x20;al. (2021)</xref>, <xref ref-type="bibr" rid="B40">Xie et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B39">Xie and Qin (2021)</xref> studied important problems related to semantic communications. In <xref ref-type="bibr" rid="B33">Shannon (1948)</xref>, the authors pointed out that semantic communication should consider higher-level information such as content or semantic-related information rather than relying only on data-oriented metrics such as data rate or bit error probability. To efficiently transmit information, the work in <xref ref-type="bibr" rid="B3">Bao et&#x20;al. (2011)</xref> investigated a model-based approach for semantic data compression and showed that classical source and channel coding theorems have semantic counterparts. Furthermore, the authors in <xref ref-type="bibr" rid="B13">Guler et&#x20;al. (2018)</xref> used Bayesian game theory to design transmission policies for transceivers and minimize the end-to-end average semantic metric while capturing the expected error between the meanings of intended and recovered messages. Moreover, the authors in <xref ref-type="bibr" rid="B35">Shi et&#x20;al. (2020)</xref> proposed a semantic-aware network architecture to reduce the required communication bandwidth and significantly improve communication efficiency. In <xref ref-type="bibr" rid="B37">Uysal et&#x20;al. (2021)</xref>, the authors defined a semantic based network system to reduce data traffic and energy consumption, hence increasing the number of wireless devices that can be supported. The work in <xref ref-type="bibr" rid="B40">Xie et&#x20;al. (2020)</xref> proposed a deep learning (DL) based text semantic communication system to reduce the wireless traffic load. Meanwhile, in <xref ref-type="bibr" rid="B39">Xie and Qin (2021)</xref>, the authors developed a new distributed text semantic communication system for IoT devices and showed that a compression ratio of nearly 20 times can be achieved without any performance degradation. However, most of these existing works (<xref ref-type="bibr" rid="B33">Shannon, 1948</xref>; <xref ref-type="bibr" rid="B3">Bao et&#x20;al., 2011</xref>; <xref ref-type="bibr" rid="B35">Shi et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B13">Guler et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B37">Uysal et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B40">Xie et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B39">Xie and Qin, 2021</xref>), which focused on the use of semantic communication for text data processing, did not consider how to extract the meaning out of audio data. Here, we note that audio data is completely different from text data since audio signals have a very high temporal resolution of at least 16,000 samples per second (<xref ref-type="bibr" rid="B19">Jurafsky and Martin, 2009</xref>).</p>
<p>The prior art in <xref ref-type="bibr" rid="B19">Jurafsky and Martin (2009)</xref>, <xref ref-type="bibr" rid="B32">Schneider et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B2">Amodei et&#x20;al. (2016)</xref>, <xref ref-type="bibr" rid="B28">Oord et&#x20;al. (2016)</xref> studied the problem of audio feature extraction. In <xref ref-type="bibr" rid="B19">Jurafsky and Martin (2009)</xref>, the authors adopted the so-called Mel-frequency cepstral coefficients (MFCC) features to represent the characteristics of audio signals. However, MFCC features are extracted only in the frequency domain and, thus, do not capture the contextual relations in sequential audio data. Recently, the works in <xref ref-type="bibr" rid="B32">Schneider et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B2">Amodei et&#x20;al. (2016)</xref>, <xref ref-type="bibr" rid="B28">Oord et&#x20;al. (2016)</xref> used DL based natural language processing (NLP) models to extract audio semantic features. In particular, the authors in (<xref ref-type="bibr" rid="B32">Schneider et&#x20;al., 2019</xref>) proposed a wave to vector (wav2vec) architecture to effectively extract semantic information. The authors in <xref ref-type="bibr" rid="B2">Amodei et&#x20;al. (2016)</xref> proposed an end-to-end model that recognizes speech in various languages. In <xref ref-type="bibr" rid="B28">Oord et&#x20;al. (2016)</xref>, the authors proposed a speech generator that can generate speech audio signals with different styles from wave data. However, the works in <xref ref-type="bibr" rid="B32">Schneider et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B2">Amodei et&#x20;al. (2016)</xref>, <xref ref-type="bibr" rid="B28">Oord et&#x20;al. (2016)</xref> did not account for the impact of channel noise on the transmitted data. Meanwhile, the work in <xref ref-type="bibr" rid="B28">Oord et&#x20;al. (2016)</xref> did not propose any method to generate audio signals from transmitted semantic information.</p>
<p>The use of federated learning (FL) in edge networks was studied in <xref ref-type="bibr" rid="B4">Bonawitz et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B36">Tran et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B5">Chen et&#x20;al. (2021a)</xref>, <xref ref-type="bibr" rid="B8">Chen et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B41">Yang K. et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B18">Imteaj et&#x20;al. (2021)</xref>, <xref ref-type="bibr" rid="B23">Li et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B17">Imteaj and Amini (2019)</xref>, <xref ref-type="bibr" rid="B6">Chen et&#x20;al. (2021b)</xref>, <xref ref-type="bibr" rid="B42">Yang Z. et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B20">Kang et&#x20;al. (2019)</xref>. In <xref ref-type="bibr" rid="B4">Bonawitz et&#x20;al. (2019)</xref>, <xref ref-type="bibr" rid="B36">Tran et&#x20;al. (2019)</xref>, the authors introduced an FL method to generate a global model by collaboratively learning from multiple edge devices, thus enabling distributed learning without sharing datasets. The work in <xref ref-type="bibr" rid="B7">Chen et&#x20;al. (2021c)</xref> proposed an FL framework in wireless networks that jointly considers wireless resource allocation and user selection while optimizing FL learning performance. To accelerate the convergence of FL, the authors in <xref ref-type="bibr" rid="B8">Chen et&#x20;al. (2020)</xref> proposed a probabilistic user selection scheme to enhance the efficiency of model aggregation, thus improving the convergence speed and reducing the FL training loss. Moreover, the authors in <xref ref-type="bibr" rid="B41">Yang K. et&#x20;al. (2020)</xref> introduced over-the-air computation for fast global model aggregation, which is realized using the superposition property of a wireless multiple-access channel. To explore the applications of FL, the works in <xref ref-type="bibr" rid="B18">Imteaj et&#x20;al. (2021)</xref>, <xref ref-type="bibr" rid="B23">Li et&#x20;al. (2020)</xref>, <xref ref-type="bibr" rid="B17">Imteaj and Amini (2019)</xref>, <xref ref-type="bibr" rid="B5">Chen et&#x20;al. (2021a)</xref> provided comprehensive summaries of FL deployed on IoT devices. In addition, the work in <xref ref-type="bibr" rid="B42">Yang Z. et&#x20;al. (2020)</xref> proposed an energy-efficient scheme to minimize the FL energy consumption and completion time, where closed-form solutions for wireless resource allocation are derived. In <xref ref-type="bibr" rid="B20">Kang et&#x20;al. (2019)</xref>, the authors proposed efficient incentive mechanisms that combine blockchain based reputation with contract theory to improve FL security and accuracy. However, most of the above works (<xref ref-type="bibr" rid="B4">Bonawitz et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B36">Tran et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B6">Chen et&#x20;al., 2021b</xref>; <xref ref-type="bibr" rid="B8">Chen et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B41">Yang K. et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B18">Imteaj et&#x20;al., 2021</xref>; <xref ref-type="bibr" rid="B23">Li et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B17">Imteaj and Amini, 2019</xref>; <xref ref-type="bibr" rid="B7">Chen et&#x20;al., 2021c</xref>; <xref ref-type="bibr" rid="B42">Yang Z. et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B20">Kang et&#x20;al., 2019</xref>) studied prediction models and ignored the impact of FL on the performance of semantic communication.</p>
<p>The main contribution of this paper is a novel semantic communication model for audio communication, which is trained via federated learning (FL). Our key contributions include:<list list-type="simple">
<list-item>
<p>&#x2022; We develop a realistic implementation of an ASC system in which wireless devices transmit large audio command data to a server. For the considered system, the bandwidth for audio data transmission is limited and, thus, semantic information is extracted and transmitted to overcome this limitation. To further improve the accuracy of semantic information extraction, the semantic extraction model must learn from multiple devices. Hence, FL is introduced to train the model while reducing the communication overhead of sharing training data. We formulate this audio communication problem as a signal recovery problem whose goal is to minimize the mean squared error (MSE) between the recovered audio signals and the source audio signals.</p>
</list-item>
<list-item>
<p>&#x2022; To solve this problem, we propose a wav2vec based autoencoder that uses flexible convolutional neural networks (CNNs) to extract semantic information from source audio signals. The autoencoder consists of an encoder and a decoder. The encoder perceives and encodes temporal features of audio signals into semantic information, which is transmitted over an imperfect wireless channel with noise. Then, the decoder decodes the received semantic information and recovers the audio signals while alleviating channel noise. In this way, the proposed autoencoder transmits less data while jointly designing the source coding and channel coding in the autoencoder.</p>
</list-item>
<list-item>
<p>&#x2022; To improve the accuracy of semantic information extraction, FL is implemented to collaboratively train the autoencoder over multiple devices and the server. In each FL training period, each local model is first trained with the audio data from the local device. Then, the parameters of the local models are transmitted to the server. Finally, the server aggregates the collected local models into a global model and broadcasts the global model to all the devices participating in FL (a minimal sketch of this aggregation step is given after this list). Thus, the proposed autoencoder can integrate more audio features from multiple users and, hence, improve the accuracy of semantic information extraction.</p>
</list-item>
<list-item>
<p>&#x2022; We perform a fundamental analysis of the noise immunity and convergence of the proposed autoencoder. We theoretically show that the number of semantic features, the time domain downsampling rate, and the FL training method can significantly influence the performance of the autoencoder.</p>
</list-item>
</list>
</p>
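<p>For illustration, the FL aggregation step described in the third contribution above can be sketched in a few lines of code. The snippet below is a minimal sketch, assuming PyTorch-style parameter dictionaries; the function name <monospace>fed_avg</monospace>, the equal-weight averaging, and the commented training loop are illustrative assumptions rather than the exact implementation used in this paper.</p>
<preformat>
import torch

def fed_avg(local_state_dicts):
    """Hypothetical FedAvg-style aggregation: average each parameter
    tensor across the local models collected from the devices."""
    global_state = {}
    for key in local_state_dicts[0]:
        stacked = torch.stack([sd[key].float() for sd in local_state_dicts])
        global_state[key] = stacked.mean(dim=0)  # equal-weight average
    return global_state

# One FL round (sketch): each device trains its local autoencoder,
# uploads the weights, and the server aggregates and broadcasts back.
# for fl_round in range(num_rounds):
#     local_state_dicts = [train_locally(device) for device in devices]
#     global_state = fed_avg(local_state_dicts)
#     broadcast(global_state, devices)
</preformat>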
<p>Simulation results show that the proposed algorithm can effectively converge and reduce the MSE between the recovered and the source audio signals by nearly 100 times, compared to a traditional coding scheme. To the best of our knowledge, this is the first work that studies the ASC model and uses FL to improve model performance, while avoiding the need for sharing training data.</p>
<p>The rest of this paper is organized as follows. The system model and problem formulation are discussed in <italic>System Model and Problem Formulation</italic>. In <italic>Audio Semantic Encoder and Decoder</italic>, we provide a detailed description of the proposed audio semantic encoder and decoder. The simulation results are presented and analyzed in <italic>Simulation and Performance Analysis</italic>. Finally, conclusions are drawn in <italic>Conclusion</italic>.</p>
</sec>
<sec id="s2">
<title>2 System Model and Problem Formulation </title>
<p>We consider a spectrum resource-limited uplink wireless network to deploy an ASC system, which consists of <italic>U</italic> edge devices, <italic>B</italic> base stations (BSs), and one server. Each edge device will transmit large audio packets to the server via the closest BS, as shown in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. Due to the limited spectrum, audio semantic information must be extracted for data transmission, thus reducing communication overhead and improving spectrum efficiency. In particular, edge devices must send audio semantic information via wireless channels to the BSs, and, then, the semantic information is delivered via optical links to the server for decoding. To extract the audio semantic information with high efficiency and accuracy, we assume that the edge devices and the server cooperatively train an ASC model using FL. The ASC model consists of an ASC encoder and an ASC decoder, as shown in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>. In particular, the ASC encoder is deployed on each edge device to extract audio semantic information while the ASC decoder is deployed on the server to recover audio signals. The objective of the ASC model is to recover the audio signals as accurately as possible. We assume that the connections between BSs and the server use optical links and have sufficient spectrum resources to support accurate transmission. We mainly consider the transmission impairments from the wireless channel between the edge devices and BSs.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The architecture of an FL based ASC system over wireless networks.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g001.tif"/>
</fig>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>The architecture of audio semantic communication (ASC).</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g002.tif"/>
</fig>
<p>To enhance noise immunity, the ASC model must be trained using the received semantic information while taking into account the wireless channel impairments. Hence, the BSs are configured to reliably send back the received semantic information to each device; this feedback occurs only during the short-term training stage. Since the extraction of semantic information determines the accuracy of ASC, we next consider the architecture design of the ASC model for audio communications.</p>
<sec id="s2-1">
<title>2.1 ASC Encoder</title>
<p>The ASC encoder is used to encode the input audio data and to extract the semantic information from the raw audio data. We define <inline-formula id="inf1">
<mml:math id="m1">
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> as the raw audio data vector where each element <italic>a</italic>
<sub>
<italic>t</italic>
</sub> is the audio data in sample <italic>t</italic> with <italic>T</italic> being the number of samples. Let <inline-formula id="inf2">
<mml:math id="m2">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> be the semantic information vector to be transmitted where <italic>x</italic>
<sub>
<italic>n</italic>
</sub> is element <italic>n</italic> in the vector. The ASC encoder extracts <bold>
<italic>x</italic>
</bold> from <bold>
<italic>a</italic>
</bold> by using a neural network (NN) model parameterized by <bold>
<italic>&#x3b8;</italic>
</bold>, thus, the relationship between <bold>
<italic>a</italic>
</bold> and <bold>
<italic>x</italic>
</bold> can be given by:<disp-formula id="e1">
<mml:math id="m3">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:math>
<label>(1)</label>
</disp-formula>where <bold>
<italic>T</italic>
</bold>
<sub>
<bold>
<italic>&#x3b8;</italic>
</bold>
</sub>(&#x22c5;) indicates the function of the ASC encoder.</p>
</sec>
<sec id="s2-2">
<title>2.2 Wireless Channel</title>
<p>When transmitted over a wireless channel, semantic information will experience channel fading and noise. We assume that the audio transmission uses a single wireless link and, hence, the received signal will be given by:<disp-formula id="e2">
<mml:math id="m4">
<mml:mi mathvariant="bold-italic">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>h</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c3;</mml:mi>
<mml:mo>,</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>where <bold>
<italic>y</italic>
</bold> is the received semantic information at the decoder with transmission impairments, <italic>h</italic> is the channel coefficient, and <inline-formula id="inf3">
<mml:math id="m5">
<mml:mi mathvariant="bold-italic">&#x3c3;</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mi mathvariant="script">N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mi mathvariant="bold-italic">I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> is Gaussian channel noise at the receiver with variance <italic>&#x3c3;</italic><sup>2</sup>, and <bold><italic>I</italic></bold> is the identity matrix.</p>
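<p>As a concrete illustration of <xref ref-type="disp-formula" rid="e2">Equation 2</xref>, the following minimal sketch simulates the wireless channel as a differentiable operation that scales the semantic vector by the channel coefficient and adds Gaussian noise. The use of PyTorch and the example values of <italic>h</italic> and <italic>&#x3c3;</italic> are illustrative assumptions.</p>
<preformat>
import torch

def wireless_channel(x, h, sigma):
    """Sketch of Eq. 2: y = h * x + noise, where the noise is
    zero-mean Gaussian with standard deviation sigma."""
    noise = sigma * torch.randn_like(x)
    return h * x + noise

# Example: a semantic vector with N = 128 elements sent over a channel
# with coefficient h = 0.9 and noise standard deviation sigma = 0.1.
x = torch.randn(128)
y = wireless_channel(x, h=0.9, sigma=0.1)
</preformat>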
</sec>
<sec id="s2-3">
<title>2.3 ASC Decoder</title>
<p>The ASC decoder is used to recover the audio data <bold>
<italic>a</italic>
</bold> from the received semantic information <bold>
<italic>y</italic>
</bold> and to alleviate transmission impairments. The functions of the decoder and the encoder are generally reciprocal. Let <inline-formula id="inf4">
<mml:math id="m6">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> be the decoded audio data and <bold>
<italic>&#x3c6;</italic>
</bold> be the parameters of the NN model in the ASC decoder. Then the relationship between <inline-formula id="inf5">
<mml:math id="m7">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> and <bold>
<italic>y</italic>
</bold> can be given by:<disp-formula id="e3">
<mml:math id="m8">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>where <bold>
<italic>R</italic>
</bold>
<sub>
<bold>
<italic>&#x3c6;</italic>
</bold>
</sub>(&#x22c5;) indicates the function of the ASC decoder.</p>
</sec>
<sec id="s2-4">
<title>2.4 ASC Objective</title>
<p>The objective of the ASC system is to recover the audio signals as accurately as possible. Since the ASC system transmits semantic information, the bit error rate (BER) is not a suitable metric for assessing ASC. Hence, we use the mean squared error (MSE) to evaluate the quality of ASC at the semantic level. The ASC system objective function can be formulated to minimize the MSE between <bold>
<italic>a</italic>
</bold> and <inline-formula id="inf6">
<mml:math id="m9">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, as follows:<disp-formula id="e4">
<mml:math id="m10">
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mspace width="0.3333em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>MSE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mspace width="0.3333em"/>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>where <bold>
<italic>&#x3b8;</italic>
</bold> and <bold>
<italic>&#x3c6;</italic>
</bold> are the parameters of the ASC encoder and ASC decoder, respectively. Here, we assume that the architectures of <bold>
<italic>T</italic>
</bold>
<sub>
<bold>
<italic>&#x3b8;</italic>
</bold>
</sub> and <bold>
<italic>R</italic>
</bold>
<sub>
<bold>
<italic>&#x3c6;</italic>
</bold>
</sub> remain fixed and we only update the NN weights when solving problem <xref ref-type="disp-formula" rid="e4">Equation 4</xref>. Hence, it is necessary to properly design the architectures of the ASC encoder and the ASC decoder. To this end, we introduce an autoencoder to extract audio semantic information.</p>
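<p>For concreteness, the objective in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> can be minimized by backpropagating the MSE through the decoder, the simulated channel, and the encoder in a single training step. The sketch below assumes the encoder and decoder are PyTorch modules; it is a minimal illustration, not the exact training code of this paper.</p>
<preformat>
import torch

def train_step(encoder, decoder, optimizer, a, h, sigma):
    """One gradient step on the MSE objective of Eq. 4 (sketch).
    The channel of Eq. 2 is simulated inside the forward pass so that
    gradients flow from the decoder back to the encoder."""
    x = encoder(a)                            # semantic information (Eq. 1)
    y = h * x + sigma * torch.randn_like(x)   # wireless channel (Eq. 2)
    a_hat = decoder(y)                        # recovered audio (Eq. 3)
    loss = torch.mean((a - a_hat) ** 2)       # MSE of Eq. 4
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
</preformat>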
</sec>
</sec>
<sec id="s3">
<title>3 Audio Semantic Encoder and Decoder</title>
<p>To solve problem (<xref ref-type="disp-formula" rid="e4">Eq. 4</xref>), we first propose a wav2vec architecture based autoencoder to efficiently extract audio information. Then, to further improve the accuracy of semantic information extraction, the autoencoder is trained with FL over multiple devices and the server. Thus, the proposed autoencoder can learn semantic information extraction from the audio information of diverse&#x20;users.</p>
<sec id="s3-1">
<title>3.1 Wav2vec Architecture Based Autoencoder</title>
<p>In the proposed architecture, as shown in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>, the ASC system can be interpreted as an autoencoder (<xref ref-type="bibr" rid="B27">O&#x2019;Shea and Hoydis, 2017</xref>; <xref ref-type="bibr" rid="B11">Goodfellow et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B25">Lu et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B9">D&#xf6;rner et&#x20;al., 2018</xref>). This autoencoder is trained to recover the input signals at the output end using compressed data features. Since the data must pass through each layer of the autoencoder, the autoencoder must find a robust representation of the input data at each layer (<xref ref-type="bibr" rid="B25">Lu et&#x20;al., 2020</xref>). In particular, NN models are used to build each layer in the autoencoder. Since CNNs are particularly good at extracting features and can be deployed in parallel over time on multiple devices, we prefer to use CNNs instead of other NNs such as recurrent neural networks (<xref ref-type="bibr" rid="B34">Shewalkar et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B14">Hori et&#x20;al., 2018</xref>; <xref ref-type="bibr" rid="B12">Graves et&#x20;al., 2013</xref>). Next, we introduce our CNN-based wav2vec architecture for semantic information extraction.</p>
<p>To extract the semantic information, we use a wav2vec model as the audio semantic encoder. A simplification of our wav2vec architecture is shown in <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>. From <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>, we see that the wav2vec architecture uses two cascaded CNNs, called the feature extractor and the feature aggregator (<xref ref-type="bibr" rid="B32">Schneider et&#x20;al., 2019</xref>), to extract audio semantic information. Given the raw audio vector, the extractor refines rough audio features and the aggregator combines the rough audio features into a higher-level latent variable that contains semantic relations among contextual audio features (<xref ref-type="bibr" rid="B32">Schneider et&#x20;al., 2019</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The wav2vec architecture.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g003.tif"/>
</fig>
<p>According to the wav2vec architecture, we design an audio semantic decoder, whose network architecture is symmetrical to the original wav2vec model (<xref ref-type="bibr" rid="B32">Schneider et&#x20;al., 2019</xref>). Combining an audio semantic encoder with the corresponding semantic decoder, we propose a wav2vec based autoencoder, as shown in <xref ref-type="fig" rid="F4">Figure&#x20;4</xref>. In the autoencoder, the audio semantic encoder extracts the semantic information and the decoder recovers the audio signals from the received semantic information. Each encoder and decoder jointly implements source coding and channel coding. Considering the transmission impairments, the semantic information is designed to accurately capture the time domain contextual relations of the audio signals, so as to resist channel fading and noise interference.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Data shape in the proposed autoencoder over ASC system.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g004.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F5">Figure&#x20;5</xref> shows the NN layers of the proposed autoencoder. According to <xref ref-type="fig" rid="F5">Figure&#x20;5</xref>, we observe that, given the raw audio signals <bold>
<italic>a</italic>
</bold>, the audio semantic encoder is used to extract the semantic vector <bold>
<italic>x</italic>
</bold>. In the proposed audio semantic encoder, the data first passes through a feature extractor and then a feature aggregator. The feature extractor and the aggregator consist of <italic>L</italic>
<sub>ext</sub> and <italic>L</italic>
<sub>agg</sub> convolution blocks, respectively. In particular, each convolution block consists of 1) a convolution layer, 2) a dropout layer, and 3) a batch normalization layer, defined as follows:<list list-type="simple">
<list-item>
<p>&#x2022; Convolutional Layer: In CNNs, a convolutional layer is used to extract the spatial correlation of the input data with 1-D convolution between the input data <bold>
<italic>Z</italic>
</bold>
<sup>
<italic>l</italic>&#x2212;1</sup> and the kernel matrix. Mathematically, given the input <inline-formula id="inf7">
<mml:math id="m11">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, the output of the convolutional layer <italic>l</italic> is <inline-formula id="inf8">
<mml:math id="m12">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, where <inline-formula id="inf9">
<mml:math id="m13">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> is the feature map <italic>m</italic> of convolutional layer <italic>l</italic> with <italic>M</italic>
<sup>
<italic>l</italic>
</sup> being the number of output features. Hence, the input <bold>
<italic>Z</italic>
</bold>
<sup>0</sup> of convolutional layer 1 is the raw audio data or the output of the preceding NN module. The output of feature map <bold>
<italic>z</italic>
</bold>
<sup>
<italic>l</italic>,<italic>m</italic>
</sup> in each convolutional layer <italic>l</italic> is given by:</p>
</list-item>
</list>
</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>The architecture of the proposed autoencoder.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g005.tif"/>
</fig>
<disp-formula id="e5">
<mml:math id="m14">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munderover>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2297;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>
<p>where <italic>f</italic>(<italic>x</italic>) &#x3d; <italic>x</italic> is the linear activation function, <italic>M</italic>
<sup>
<italic>l</italic>&#x2212;1</sup> is the number of feature maps in the previous convolutional layer <italic>l</italic>&#x2212;1, &#x2297; denotes the 1-D convolution operation, and <inline-formula id="inf10">
<mml:math id="m15">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> and <inline-formula id="inf11">
<mml:math id="m16">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> are convolution kernels and bias vector of feature map <italic>m</italic> in convolutional layer <italic>l</italic>, respectively, with <italic>s</italic>
<sub>
<italic>k</italic>
</sub> being the kernel size. Let the convolution stride be <italic>s</italic>
<sub>
<italic>c</italic>
</sub> and the padding size be <italic>p</italic>; then the size of feature map <italic>&#x3bb;</italic>
<sub>
<italic>l</italic>
</sub>&#x20;satisfies <inline-formula id="inf12">
<mml:math id="m17">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>&#x230A;</mml:mo>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>p</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
<mml:mo>&#x230B;</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula>. A short numerical check of this size relation is given after the list below.<list list-type="simple">
<list-item>
<p>&#x2022; Dropout Layer: The input <bold>
<italic>Z</italic>
</bold>
<sup>
<italic>l</italic>
</sup> of a dropout layer <italic>l</italic> is the output of convolutional layer <italic>l</italic>. In the training stage, the dropout layer randomly drops each neuron with a probability called the dropout rate, while in the inference stage, the dropout layer retains the effects of all neurons. The dropout layer is used as a regularization approach to avoid the overfitting problem.</p>
</list-item>
<list-item>
<p>&#x2022; Batch Normalization Layer: A batch normalization layer normalizes the values of activated neurons to avoid gradient vanishing. We define <italic>&#x3b1;</italic>
<sub>
<italic>i</italic>
</sub> as the value of the activated neuron <italic>i</italic> in convolution block <italic>l</italic>. The normalized value <inline-formula id="inf13">
<mml:math id="m18">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> of the neuron is given by <inline-formula id="inf14">
<mml:math id="m19">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3f5;</mml:mi>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>, where <inline-formula id="inf15">
<mml:math id="m20">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, <inline-formula id="inf16">
<mml:math id="m21">
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bc;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="script">B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, and <italic>&#x3f5;</italic> is a positive constant.</p>
</list-item>
</list>
</p>
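<p>The feature map size relation above can be checked numerically. The helper below is a short sketch of the floor expression for <italic>&#x3bb;</italic><sub><italic>l</italic></sub>; the function name and the example parameter values are illustrative assumptions.</p>
<preformat>
def conv_output_size(lam_prev, s_k, s_c, p):
    """Sketch of the feature map size relation:
    lam_l = floor((lam_prev + 2*p - s_k) / s_c) + 1."""
    return (lam_prev + 2 * p - s_k) // s_c + 1

# Example: a 16,000-sample audio frame, kernel size 10, stride 5, and
# no padding yields floor((16000 - 10) / 5) + 1 = 3199 time steps.
print(conv_output_size(16000, s_k=10, s_c=5, p=0))  # prints 3199
</preformat>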
<p>Since the amplitude of an audio signal is limited, tanh(&#x22c5;) is introduced as the activation function of the output layer in the feature extractor (<xref ref-type="bibr" rid="B28">Oord et&#x20;al., 2016</xref>), where <inline-formula id="inf17">
<mml:math id="m22">
<mml:mi>tanh</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:math>
</inline-formula>. To shape the transmitted semantic information with an adequate amplitude, the last layer of the feature aggregator is set as a batch normalization layer without an activation function (<xref ref-type="bibr" rid="B9">D&#xf6;rner et&#x20;al., 2018</xref>).</p>
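<p>Putting the above pieces together, the following sketch assembles a small feature extractor and feature aggregator from the described convolution blocks, with tanh at the output of the extractor and a final batch normalization layer without activation at the end of the aggregator. The layer counts, channel widths, kernel sizes, and dropout rate are illustrative assumptions, not the configuration used in the simulations.</p>
<preformat>
import torch.nn as nn

def conv_block(c_in, c_out, s_k, s_c, dropout=0.1):
    """One encoder convolution block: 1-D convolution, dropout,
    and batch normalization, as described above."""
    return nn.Sequential(
        nn.Conv1d(c_in, c_out, kernel_size=s_k, stride=s_c),
        nn.Dropout(dropout),
        nn.BatchNorm1d(c_out),
    )

# Illustrative encoder with assumed widths: the extractor ends with
# tanh; the aggregator's last block already ends in batch
# normalization with no activation, as stated above.
extractor = nn.Sequential(
    conv_block(1, 64, s_k=10, s_c=5),
    conv_block(64, 64, s_k=8, s_c=4),
    nn.Tanh(),
)
aggregator = nn.Sequential(
    conv_block(64, 64, s_k=4, s_c=2),
)
encoder = nn.Sequential(extractor, aggregator)  # realizes T_theta in Eq. 1
</preformat>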
<p>In the proposed audio semantic decoder, as shown in <xref ref-type="fig" rid="F5">Figure&#x20;5</xref>, the received semantic information first passes through a feature decomposer and then an audio generator. Different from the encoder, a deconvolution operation is introduced to build the feature decomposer and the audio generator, which consist of <italic>L</italic>
<sub>de</sub> and <italic>L</italic>
<sub>gen</sub> deconvolution blocks, respectively. Correspondingly, each deconvolution block consists of 1) one deconvolution layer, 2) one dropout layer, and 3) one batch normalization layer. Mathematically, the dropout layer and the batch normalization layer operate as in the convolution blocks; only the deconvolution layer differs.</p>
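<p>A corresponding decoder sketch mirrors the encoder with transposed (deconvolution) layers, which realize the zero-filling and convolution described next. The widths, kernel sizes, and the bounded output activation are illustrative assumptions.</p>
<preformat>
import torch.nn as nn

def deconv_block(c_in, c_out, s_k, s_c, dropout=0.1):
    """One decoder deconvolution block: transposed 1-D convolution,
    dropout, and batch normalization, as described above."""
    return nn.Sequential(
        nn.ConvTranspose1d(c_in, c_out, kernel_size=s_k, stride=s_c),
        nn.Dropout(dropout),
        nn.BatchNorm1d(c_out),
    )

# Illustrative decoder: a feature decomposer followed by an audio
# generator that upsamples back to the waveform (assumed widths).
decomposer = deconv_block(64, 64, s_k=4, s_c=2)
generator = nn.Sequential(
    deconv_block(64, 64, s_k=8, s_c=4),
    nn.ConvTranspose1d(64, 1, kernel_size=10, stride=5),
    nn.Tanh(),  # bounded waveform amplitude (our assumption)
)
decoder = nn.Sequential(decomposer, generator)  # realizes R_phi in Eq. 3
</preformat>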
<p>In the deconvolution layer, the feature matrix is first uniformly filled with zeros in each column. Given the filled input matrix <inline-formula id="inf18">
<mml:math id="m23">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1,1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, the output of a deconvolution layer <italic>l</italic> is <inline-formula id="inf19">
<mml:math id="m24">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">Z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, where <inline-formula id="inf20">
<mml:math id="m25">
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> is the filled feature map <italic>m</italic> and <inline-formula id="inf21">
<mml:math id="m26">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> is the filled feature map size. The output of feature map <bold>
<italic>z</italic>
</bold>
<sup>
<italic>l</italic>,<italic>m</italic>
</sup> in each deconvolution layer <italic>l</italic> is given by:<disp-formula id="e6">
<mml:math id="m27">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munderover>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">z</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2297;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>where <italic>M</italic>
<sup>
<italic>l</italic>&#x2212;1</sup> is the number of features of layer <italic>l</italic>&#x2212;1, and <inline-formula id="inf22">
<mml:math id="m28">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> and <inline-formula id="inf23">
<mml:math id="m29">
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> are the deconvolution kernel and the bias vector of deconvolution layer <italic>l</italic>, respectively. In a deconvolution layer, the filled feature map size <inline-formula id="inf24">
<mml:math id="m30">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> satisfies <inline-formula id="inf25">
<mml:math id="m31">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>p</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> and the size of feature map <italic>&#x3bb;</italic>
<sub>
<italic>l</italic>
</sub> satisfies <inline-formula id="inf26">
<mml:math id="m32">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mi>p</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>, where <italic>p</italic> is the padding size of layer <italic>l</italic>. Note that, to properly recover the audio signals, the output layer of the audio generator uses the tanh(&#x22c5;) activation function.</p>
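<p>To make the size relations above concrete, the following minimal Python sketch (written under the assumption of a PyTorch implementation, not the code used in our experiments) checks the output-size relation <italic>&#x3bb;</italic><sub><italic>l</italic></sub> &#x3d; <italic>s</italic><sub><italic>c</italic></sub>(<italic>&#x3bb;</italic><sub><italic>l</italic>&#x2212;1</sub> &#x2212; 1) &#x2212; 2<italic>p</italic> &#x2b; <italic>s</italic><sub><italic>k</italic></sub> against an actual transposed convolution; the layer sizes chosen here are illustrative only.</p>
<preformat>
# A minimal sketch (PyTorch assumed): verify the deconvolution size
# relation lambda_l = s_c * (lambda_{l-1} - 1) - 2p + s_k.
import torch
import torch.nn as nn

lam_prev, s_c, s_k, p = 64, 1, 4, 0   # illustrative size, stride, kernel, padding
deconv = nn.ConvTranspose1d(in_channels=8, out_channels=8,
                            kernel_size=s_k, stride=s_c, padding=p)
x = torch.randn(1, 8, lam_prev)       # a batch of M^{l-1} = 8 feature maps
lam_l = deconv(x).shape[-1]           # realized output feature-map size
assert lam_l == s_c * (lam_prev - 1) - 2 * p + s_k
</preformat>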
<p>To amplify the inference error and avoid vanishing gradients, we introduce the normalized root mean squared error (NRMSE) for the autoencoder. The objective of the autoencoder is then given by:<disp-formula id="e7">
<mml:math id="m33">
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mspace width="0.3333em"/>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>NRMSE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="bold-italic">&#x3c6;</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mspace width="0.3333em"/>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
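<p>A minimal Python sketch of the loss in <xref ref-type="disp-formula" rid="e7">Eq. 7</xref> is given below (PyTorch is assumed and the function name is ours); normalizing by the signal energy enlarges the gradients on low-amplitude audio relative to plain MSE.</p>
<preformat>
# A minimal sketch (PyTorch assumed) of the normalized loss in Eq. 7.
import torch

def nrmse_loss(a: torch.Tensor, a_hat: torch.Tensor) -> torch.Tensor:
    # sum_t (a_t - a_hat_t)^2 / sum_t a_t^2
    return torch.sum((a - a_hat) ** 2) / torch.sum(a ** 2)
</preformat>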
<p>
<statement content-type="algorithm" id="alg1">
<label>Algorithm 1</label>
<p> Local model training algorithm of the autoencoder.</p>
<p>
<inline-graphic xlink:href="frcmn-02-734402-fx1.tif"/>
</p>
</statement>
</p>
</sec>
<sec id="s3-5">
<title>3.2 FL Training Method</title>
<p>Next, our goal is to minimize the errors between the recovered audio signals and the source audio signals using the FL training method. In FL, the server and the devices collaboratively learn the proposed autoencoder by sharing the model parameters (<xref ref-type="bibr" rid="B24">Liu et&#x20;al., 2020</xref>; <xref ref-type="bibr" rid="B5">Chen et&#x20;al., 2021a</xref>; <xref ref-type="bibr" rid="B6">Chen et&#x20;al., 2021b</xref>; <xref ref-type="bibr" rid="B43">Yang et&#x20;al., 2021</xref>). We define <bold>
<italic>w</italic>
</bold> &#x3d; (<bold>
<italic>&#x3b8;</italic>
</bold>, <bold>
<italic>&#x3c6;</italic>
</bold>) as the total parameter of the proposed autoencoder, which includes both the encoder and decoder. The server generates a global model <bold>
<italic>w</italic>
</bold>
<sup>
<bold>
<italic>g</italic>
</bold>
</sup> and each device <italic>i</italic> locally trains a local autoencoder model <bold>
<italic>w</italic>
</bold>
<sub>
<italic>i</italic>
</sub> which shares the same architecture as <bold>
<italic>w</italic>
</bold>
<sup>
<bold>
<italic>g</italic>
</bold>
</sup>, as shown in <xref ref-type="fig" rid="F1">Figure&#x20;1</xref>. The global model periodically aggregates the local models from the <italic>U</italic> devices that participate in FL and broadcasts the aggregated global model back to the devices. The aggregated global model is then given by <inline-formula id="inf27">
<mml:math id="m34">
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>g</mml:mtext>
</mml:mrow>
</mml:msup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>. We use <bold>
<italic>A</italic>
</bold>
<sub>
<italic>i</italic>
</sub> to capture the audio dataset of local model <italic>i</italic>. According to problem <xref ref-type="disp-formula" rid="e7">Eq. 7</xref>, the objective of the FL training method is given by:<disp-formula id="e9">
<mml:math id="m35">
<mml:munder>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mi mathvariant="bold-italic">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>g</mml:mtext>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:munder>
<mml:mspace width="0.3333em"/>
<mml:munderover accentunder="false" accent="false">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="script">L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mtext>NRMSE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">w</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mo>.</mml:mo>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
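<p>A minimal Python sketch of the uniform model averaging used for the global aggregation is given below (PyTorch state dictionaries are assumed and the function name is ours).</p>
<preformat>
# A minimal sketch (PyTorch assumed): average U local parameter
# dictionaries into one global dictionary, w^g = (1/U) * sum_i w_i.
def aggregate(local_states):
    U = len(local_states)
    return {name: sum(s[name] for s in local_states) / U
            for name in local_states[0]}
</preformat>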
<p>
<statement content-type="algorithm" id="alg2">
<label>Algorithm 2</label>
<p> FL training algorithm of the global model (<xref ref-type="bibr" rid="B18">Imteaj et&#x20;al., 2021</xref>).</p>
<p>
<inline-graphic xlink:href="frcmn-02-734402-fx2.tif"/>
</p>
<p>&#x2001;During the local model training stage, the server first defines the architecture of the autoencoder and broadcasts it to all edge devices, which randomly initialize their local models. To keep the encoder and the decoder of the proposed autoencoder coordinated, we set them to update their parameters simultaneously so as to minimize the loss function in <xref ref-type="disp-formula" rid="e9">Eq. 9</xref>. Hence, both the encoder and the decoder update their parameters with stochastic gradient descent (SGD) once after each batch of data passes through the autoencoder.</p>
<p>The training process of each local model is shown in <xref ref-type="other" rid="alg1">Algorithm 1</xref>, where <italic>&#x3b7;</italic> in (8) is the learning&#x20;rate. During the training process of the global model, each edge device transmits the parameters of its local model <bold>
<italic>w</italic>
</bold>
<sub>
<italic>i</italic>
</sub> to the server after every fixed number of epochs. Thus, the server periodically collects the transmitted models, aggregates the parameters of the local models, and then broadcasts the updated global model to each device. In the next period, the local models update their parameters by training on the local datasets <bold>
<italic>A</italic>
</bold>
<sub>
<italic>i</italic>
</sub>, before transmitting <bold>
<italic>w</italic>
</bold>
<sub>
<italic>i</italic>
</sub> to the server, as shown in <xref ref-type="other" rid="alg1">Algorithm 1</xref>. The FL algorithm for the global model is summarized in <xref ref-type="other" rid="alg2">Algorithm&#x20;2</xref>.</p>
</statement>
</p>
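<p>The periodic schedule described above can be sketched in Python as follows (PyTorch is assumed; the helper local_epoch and the device attributes are hypothetical, and aggregate is the averaging sketch given earlier): each device runs SGD locally for a fixed number of epochs, after which the server aggregates and broadcasts the global model.</p>
<preformat>
# A minimal sketch of the periodic FL schedule (helpers are hypothetical).
def federated_training(global_state, devices, rounds, period, lr):
    for _ in range(rounds):
        local_states = []
        for dev in devices:                          # each device i
            dev.model.load_state_dict(global_state)  # receive broadcast
            for _ in range(period):                  # fixed number of epochs
                local_epoch(dev.model, dev.data, lr) # per-batch SGD (Alg. 1)
            local_states.append(dev.model.state_dict())
        global_state = aggregate(local_states)       # w^g = (1/U) sum_i w_i
    return global_state
</preformat>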
</sec>
<sec id="s3-6">
<title>3.3 Complexity Analysis</title>
<p>The proposed FL algorithm used to solve problem <xref ref-type="disp-formula" rid="e9">Eq. 9</xref> is summarized in <xref ref-type="other" rid="alg2">Algorithm 2</xref>. The complexity of the proposed algorithm lies in training the proposed autoencoder. The complexity for training the autoencoder is <inline-formula id="inf28">
<mml:math id="m36">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B38">Wang et&#x20;al., 2020</xref>), where <italic>L</italic>&#x20;&#x3d; <italic>L</italic>
<sub>ext</sub> &#x2b; <italic>L</italic>
<sub>agg</sub> &#x2b; <italic>L</italic>
<sub>de</sub> &#x2b; <italic>L</italic>
<sub>gen</sub>, with <italic>L</italic>
<sub>ext</sub>, <italic>L</italic>
<sub>agg</sub>, <italic>L</italic>
<sub>de</sub>, <italic>L</italic>
<sub>gen</sub>, and <italic>L</italic> being the number of convolution or deconvolution layers in the feature extractor, the feature aggregator, the feature decomposer, the audio generator, and the proposed autoencoder, respectively. Let <italic>L</italic>
<sub>
<italic>o</italic>
</sub> be the number of model aggregations until the FL global model converges. The complexity of the FL training method is <inline-formula id="inf29">
<mml:math id="m37">
<mml:mi mathvariant="script">O</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>U</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:math>
</inline-formula> (<xref ref-type="bibr" rid="B8">Chen et&#x20;al., 2020</xref>). In consequence, the major complexity of training the autoencoder, which depends on the number of NN layers, the kernel sizes and the numbers of features in each layer, is linear. Meanwhile, since the layers in the autoencoder are finite, the local training is achievable and, hence the edge devices can support the FL training in the considered wireless network. Once the training process is completed, the trained autoencoder can be used for ASC in a long term period.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Simulation and Performance Analysis</title>
<p>To evaluate the proposed autoencoder, we train the model using a training set from the speech dataset Librispeech (<xref ref-type="bibr" rid="B29">Panayotov et&#x20;al., 2015</xref>), which contains 1,000&#xa0;h of 16&#xa0;kHz read English speech. The learning rate <italic>&#x3b7;</italic> is 10<sup>&#x2013;5</sup>. The proposed autoencoder is trained under additive white Gaussian noise (AWGN) channels with a fixed channel coefficient <italic>h</italic> and a 6&#xa0;dB signal-to-noise ratio (SNR), and it is tested on 200,000 samples of speech data. The simulation parameters are listed in <xref ref-type="table" rid="T1">Table&#x20;1</xref> (<xref ref-type="bibr" rid="B21">Kang et&#x20;al., 2020</xref>). We train the model using the FL method with one global model and two local models (user 1 and user 2); each local model is trained on read speech from a single speaker, and the FL models are tested on read speech from a third user (user 3). The global model aggregates the local models every 10 local training epochs.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Simulation parameters.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Module</th>
<th align="center">Setting</th>
<th align="center">Parameter</th>
<th align="center">Value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">feature extractor</td>
<td align="center">
<italic>L</italic>
<sub>ext</sub> &#x3d; 3</td>
<td align="left">feature <italic>M</italic>
<sup>
<italic>l</italic>
</sup>
</td>
<td align="center">8,8,8</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">kernel size <italic>s</italic>
<sub>
<italic>k</italic>
</sub>
</td>
<td align="center">1,2,4</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">stride <italic>s</italic>
<sub>
<italic>c</italic>
</sub>
</td>
<td align="center">1,1,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">dropout rate</td>
<td align="center">0.5</td>
</tr>
<tr>
<td align="left">feature aggregator</td>
<td align="center">
<italic>L</italic>
<sub>agg</sub> &#x3d; 4</td>
<td align="left">feature <italic>M</italic>
<sup>
<italic>l</italic>
</sup>
</td>
<td align="center">8,8,8,8</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">kernel size <italic>s</italic>
<sub>
<italic>k</italic>
</sub>
</td>
<td align="center">2,4,8,16</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">stride <italic>s</italic>
<sub>
<italic>c</italic>
</sub>
</td>
<td align="center">1,1,1,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">dropout rate</td>
<td align="center">0.5</td>
</tr>
<tr>
<td align="left">feature decomposer</td>
<td align="center">
<italic>L</italic>
<sub>de</sub> &#x3d; 4</td>
<td align="left">feature <italic>M</italic>
<sup>
<italic>l</italic>
</sup>
</td>
<td align="center">8,8,8,8</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">kernel size <italic>s</italic>
<sub>
<italic>k</italic>
</sub>
</td>
<td align="center">2,4,8,16</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">stride <italic>s</italic>
<sub>
<italic>c</italic>
</sub>
</td>
<td align="center">1,1,1,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">dropout rate</td>
<td align="center">0.5</td>
</tr>
<tr>
<td align="left">audio generator</td>
<td align="center">
<italic>L</italic>
<sub>gen</sub> &#x3d; 4</td>
<td align="left">feature <italic>M</italic>
<sup>
<italic>l</italic>
</sup>
</td>
<td align="center">8,8,8,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">kernel size <italic>s</italic>
<sub>
<italic>k</italic>
</sub>
</td>
<td align="center">1,2,4,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">stride <italic>s</italic>
<sub>
<italic>c</italic>
</sub>
</td>
<td align="center">1,1,1,1</td>
</tr>
<tr>
<td align="left"/>
<td align="left"/>
<td align="left">dropout rate</td>
<td align="center">0.5</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>For comparison purposes, we simulate a baseline scheme for high-quality audio transmission, which uses 128&#xa0;kbps pulse code modulation (PCM) with 8-bit quantization (<xref ref-type="bibr" rid="B26">Nakano et&#x20;al., 1982</xref>) for source coding, low-density parity-check (LDPC) codes (<xref ref-type="bibr" rid="B10">Gallager, 1962</xref>) for channel coding, and 64-QAM (<xref ref-type="bibr" rid="B30">Pfau et&#x20;al., 2009</xref>) for modulation. In this section, for notational convenience, we call the proposed autoencoder for ASC the &#x201c;semantic method,&#x201d; and we call the baseline scheme the &#x201c;traditional method&#x201d;. Note that the autoencoder is trained <italic>via</italic> NRMSE and tested <italic>via</italic> MSE. This is because NRMSE induces larger gradients for training the autoencoder, while MSE exhibits clearer fluctuations for comparing results. To verify the performance of the proposed FL algorithm, we compare against two baselines: a transfer learning method and local gradient descent FL (<xref ref-type="bibr" rid="B18">Imteaj et&#x20;al., 2021</xref>). In the transfer learning method, the feature aggregator and the feature decomposer in the autoencoder are first initialized with a pre-trained model, and the autoencoder is then trained using local audio data. In local gradient descent FL, at the start of each iteration, all devices first share an aggregated model; each device then computes a fixed number of local gradient descent updates (1,000 steps) in parallel.</p>
<p>
<xref ref-type="fig" rid="F6">Figure&#x20;6</xref> shows examples of the raw audio data, the extracted semantic information reshaped in block form, the received semantic information, and the recovered audio data in one local&#x20;model. From <xref ref-type="fig" rid="F6">Figures 6A&#x2013;C</xref>, we see that, the audio semantic information signals are amplified by the proposed semantic encoder before being transmitted through the channel. From <xref ref-type="fig" rid="F6">Figure&#x20;6B</xref>, it is also observed that, the extracted eight different blocks of semantic features have correlations. From <xref ref-type="fig" rid="F6">Figures 6C,D</xref>, we see that the proposed semantic decoder eliminates the channel noise from the received signals. The elimination of the noise is due to the fact that the semantic decoder relieves the noise using multiple semantic features. <xref ref-type="fig" rid="F6">Figure&#x20;6</xref> shows that the proposed autoencoder can effectively guarantee the accuracy of&#x20;ASC.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Visualizations of a raw audio fragment, the corresponding semantic information reshaped in block form, the received semantic information, and the recovered audio signals.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g006.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F7">Figure&#x20;7</xref> shows how transmission MSE of a local model using semantic method changes as the number of features varies. From <xref ref-type="fig" rid="F7">Figure&#x20;7</xref>, we see that, as the number of features increases, the MSE of the proposed semantic method decreases first and, then remains unchanged. This phenomenon is due to the fact that higher dimension features provide better semantic representations thus improving the transmission performance of the semantic method. From <xref ref-type="fig" rid="F7">Figure&#x20;7</xref>, we can also see that, when the number of features is larger than 16, the MSE of the semantic method tends to be leveling off. This result is because of the existence of redundant semantic features which provide limited noise immunity for&#x20;ASC.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Transmission MSE of a local autoencoder model as the number of features varies, in AWGN channels with a 6&#xa0;dB SNR.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g007.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>, we show how the transmission MSE of a local&#x20;model using the proposed semantic method, BER and MSE of the traditional method change as the channel SNR varies. In this simulation, the semantic method reduces communication overhead by decreasing nearly 1/3 of the transmission data amount compared to the traditional method. From <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>, we observe that, as the channel SNR increases, the error of communication decreases as expected. From <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>, we can also see that our semantic method reduces the transmission MSE by nearly 100 times, compared to the traditional method, and the MSE of semantic method varies flatter than that of traditional method. The improvement is due to the fact that the semantic method has a better transmission accuracy and noise immunity performance. From <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>, we can also see that, the MSE of the traditional method remains unchanged when the SNR is larger than 14&#xa0;dB. The phenomenon is because, for a lower BER, the accuracy of the traditional coding scheme will reach the coding limit and, hence, the MSE will stay at a quantization error level caused by PCM quantization.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Transmission MSE of a local model using the semantic method, and the BER and transmission MSE of the traditional method, as the SNR varies.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g008.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F9">Figure&#x20;9</xref> shows how the transmission MSE changes versus various channel SNR, where the semantic method uses different time domain downsampling rates. In this simulation, lower time domain downsampling rates can reduce transmission data amount exponentially and are realized by changing the convolution strides in the feature extractor and feature decomposer. From <xref ref-type="fig" rid="F9">Figure&#x20;9</xref>, we can see that, a lower time domain downsampling rate leads to more transmission error, which is because of the more loss of semantic information. From <xref ref-type="fig" rid="F9">Figure&#x20;9</xref>, we can also observe that as the SNR increases, the decreasing speed of the MSE differs among different downsampling rates. The disparity is due to the fact that, the semantic information extracted with different downsampling rates has diverse sensitivities to the SNR. <xref ref-type="fig" rid="F9">Figure&#x20;9</xref> shows that reducing the time domain sampling rates decreases the communication accuracy. In consequence, <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> and <xref ref-type="fig" rid="F9">Figure&#x20;9</xref> demonstrate that, in terms of improving the performance of semantic communication, the complexity of semantic features trades off the data compression&#x20;rate.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Transmission MSE of the proposed semantic method with different time domain downsampling rates. The number of semantic features is 8.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g009.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>, we show how the validation loss changes as the training epoch increases. From <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>, we observe that, the validation loss initially decreases with fluctuation first and then remains unchanged. The fact that the validation loss remains unchanged demonstrates that the FL algorithm converges. From <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>, we can see that, when the FL global model is aggregated, the loss of local models increases for several epochs first and, then decreases in a long-term view. The result is due to the difference of the multiple local audio datasets from different users. At the beginning of training, the aggregation of multiple local models will critically change the parameter distribution of the global model. Then, as the training process continues, the global model parameters fit multiple local datasets. Hence the fluctuation caused by FL model aggregation weakens, and the local models of multiple users converge. From <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>, we can also see that FL model aggregation further decreases the lower bound of loss in each local model. This phenomenon is because that FL training method aggregates audio semantic features from multiple users, thus enhancing model performance compared with local training method.</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Convergence results of the proposed FL models.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g010.tif"/>
</fig>
<p>
<xref ref-type="fig" rid="F11">Figure&#x20;11</xref> shows how the transmission MSE of all algorithms changes as the channel SNR varies. From <xref ref-type="fig" rid="F11">Figure&#x20;11</xref>, we observe that the performance of the proposed model differs among the diverse users due to the various audio characteristics. We can also see that transfer learning can improve the model&#x20;performance compared to locally training. Besides, local gradient descent FL outperforms part, but not all of the locally trained models. The difference of the baselines is&#x20;because that transfer learning can further learn audio semantic extraction based on pre-trained model parameters. Whilst local gradient descent FL aggregates the global model&#x20;with low frequency, where the difference among local models leads to the inefficiency on improving semantic extraction. From <xref ref-type="fig" rid="F11">Figure&#x20;11</xref>, we can also see that, the proposed FL algorithm outperforms the locally trained models. The superiority is because that the FL trained model aggregates audio characteristics of all users and hence obtaining more robust performance. We can also observe from the dotted lines that the proposed FL training method is superior over transfer learning and local gradient descent FL. The superiority is due to the fact that the proposed FL algorithm aggregates the model in a frequent and synchronous way, which guarantees a more accurate semantic extraction than that of the baselines.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Transmission MSE of the FL-trained model, the locally trained models, transfer learning, and local gradient descent FL.</p>
</caption>
<graphic xlink:href="frcmn-02-734402-g011.tif"/>
</fig>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>In this paper, we have developed an FL-trained model for an ASC architecture in wireless networks. We have considered the problem of avoiding both the sharing of training data and the heavy communication overhead of transmitting large audio files between the edge devices and the server. To solve this problem, we have proposed a wav2vec based autoencoder that effectively encodes, transmits, and decodes audio semantic information, rather than traditional bits or symbols, to reduce the communication overhead. The autoencoder is then trained with FL to improve the accuracy of semantic information extraction. Simulation results have shown that the proposed algorithm converges effectively and yields a significant reduction in transmission error compared to an existing coding scheme that uses PCM, LDPC, and 64-QAM.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>The original contributions presented in the study are included in the article/supplementary files; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>HT: formulated the audio semantic communication system, applied the learning approach, and produced the simulation results. ZY: helped proofread the whole paper and provided suggestions on the structure of the paper. SW: helped check the simulation results and debug the code. YH: provided suggestions on the flow chart and wrote the theoretical analysis. OS: conducted the literature review and identified the key novelty of this paper. WS: proposed the idea of audio semantic communication and guided the writing. CY: polished the language of the whole paper and checked all the formulations.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This work was supported in part by Beijing Natural Science Foundation and Municipal Education Committee Joint Funding Project under Grant KZ201911232046, in part by the National Natural Science Foundation of China under Grants 61671086 and 61629101, in part by the 111 Project under Grant B17007, in part by U.S. National Science Foundation (NSF) under Grants CNS-2007635 and CNS-2008646, and in part by BUPT Excellent Ph.D. Students Foundation under Grant CX2021114.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s10" sec-type="disclaimer">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Al-Garadi</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Mohamed</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Al-Ali</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Guizani</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A Survey of Machine and Deep Learning Methods for Internet of Things (IoT) Security</article-title>. <source>IEEE Commun. Surv. Tutorials</source> <volume>22</volume> (<issue>3</issue>), <fpage>1646</fpage>&#x2013;<lpage>1685</lpage>. <pub-id pub-id-type="doi">10.1109/COMST.2020.2988293</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Amodei</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ananthanarayanan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Anubhai</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Battenberg</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Case</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Casper</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Catanzaro</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep Speech 2: End-To-End Speech Recognition in English and Mandarin</article-title>,&#x201d; in <conf-name>Proc. of International Conference on Machine Learning</conf-name> (<publisher-loc>NY, USA</publisher-loc>: <publisher-name>ICML</publisher-name>), <fpage>173</fpage>&#x2013;<lpage>182</lpage>. </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Basu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Partridge</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Swami</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Leland</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Towards a Theory of Semantic Communication</article-title>. <source>Proc. IEEE Netw. Sci. Workshop</source> <volume>2011</volume>, <fpage>110</fpage>&#x2013;<lpage>117</lpage>. <pub-id pub-id-type="doi">10.1109/nsw.2011.6004632</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bonawitz</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Eichner</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Grieskamp</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Huba</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ingerman</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ivanov</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Towards Federated Learning at Scale: System Design</article-title>. <source>arXiv, Vol. abs/1902.01046</source>. </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>G&#xfc;nd&#xfc;z</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bennis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Feljan</surname>
<given-names>A. V.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). <article-title>Distributed Learning in Wireless Networks: Recent Progress and Future Challenges</article-title>. <source>arXiv:2104.02151</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2104.02151">http://arxiv.org/abs/2104.02151</ext-link>.</comment> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Shlezinger</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Poor</surname>
<given-names>H. V.</given-names>
</name>
<name>
<surname>Eldar</surname>
<given-names>Y. C.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Communication-efficient Federated Learning</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>118</volume> (<issue>17</issue>), <fpage>e2024789118</fpage>. <pub-id pub-id-type="doi">10.1073/pnas.2024789118</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Poor</surname>
<given-names>H. V.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021c</year>). <article-title>A Joint Learning and Communications Framework for Federated Learning over Wireless Networks</article-title>. <source>IEEE Trans. Wireless Commun.</source> <volume>20</volume> (<issue>1</issue>), <fpage>269</fpage>&#x2013;<lpage>283</lpage>. <pub-id pub-id-type="doi">10.1109/twc.2020.3024629</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Vincent Poor</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Convergence Time Optimization for Federated Learning over Wireless Networks</article-title>. <source>IEEE Trans. Wireless Commun.</source> <volume>20</volume> (<issue>4</issue>), <fpage>2457</fpage>&#x2013;<lpage>2471</lpage>. <pub-id pub-id-type="doi">10.1109/TWC.2020.3042530</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>D&#xf6;rner</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cammerer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hoydis</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Brink</surname>
<given-names>S. T.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Deep Learning Based Communication over the Air</article-title>. <source>IEEE J.&#x20;Sel. Top. Signal. Process.</source> <volume>12</volume> (<issue>1</issue>), <fpage>132</fpage>&#x2013;<lpage>143</lpage>. <pub-id pub-id-type="doi">10.1109/jstsp.2017.2784180</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gallager</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1962</year>). <article-title>Low-density Parity-Check Codes</article-title>. <source>IEEE Trans. Inform. Theor.</source> <volume>8</volume> (<issue>1</issue>), <fpage>21</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1109/tit.1962.1057683</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Goodfellow</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Bengio</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Courville</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). <source>Deep Learning</source>. <publisher-loc>Cambridge, MA, USA</publisher-loc>: <publisher-name>MIT Press</publisher-name>.</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Graves</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mohamed</surname>
<given-names>A.-r.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Speech Recognition with Deep Recurrent Neural Networks</article-title>,&#x201d; in <conf-name>Proc. IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-loc>Vancouver, BC, Canada</publisher-loc>: <publisher-name>ICASSP</publisher-name>). <pub-id pub-id-type="doi">10.1109/icassp.2013.6638947</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guler</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Yener</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Swami</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The Semantic Communication Game</article-title>. <source>IEEE Trans. Cogn. Commun. Netw.</source> <volume>4</volume> (<issue>4</issue>), <fpage>787</fpage>&#x2013;<lpage>802</lpage>. <pub-id pub-id-type="doi">10.1109/tccn.2018.2872596</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hori</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Cho</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Watanabe</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>End-to-end Speech Recognition with Word-Based RNN Language Models</article-title>,&#x201d; in <conf-name>Proc. IEEE Spoken Language&#x20;Technology Workshop</conf-name> (<publisher-loc>Athens, Greece</publisher-loc>: <publisher-name>SLT</publisher-name>), <fpage>389</fpage>&#x2013;<lpage>396</lpage>. <pub-id pub-id-type="doi">10.1109/slt.2018.8639693</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Poor</surname>
<given-names>H. V.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Distributed Multi-Agent Meta Learning for Trajectory Design in Wireless Drone Networks</article-title>. <source>IEEE J.&#x20;Selected Areas Commun.</source> <volume>PP</volume> (<issue>99</issue>), <fpage>1</fpage>. <pub-id pub-id-type="doi">10.1109/jsac.2021.3088689</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Alexandropoulos</surname>
<given-names>G. C.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yuen</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Multi-hop RIS-Empowered Terahertz Communications: A DRL-Based Hybrid Beamforming Design</article-title>. <source>IEEE J.&#x20;Selected Areas Commun.</source> <pub-id pub-id-type="doi">10.1109/jsac.2021.3071836</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Imteaj</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Amini</surname>
<given-names>M. H.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Distributed Sensing Using Smart End-User Devices: Pathway to Federated Learning for Autonomous IoT</article-title>,&#x201d; in <conf-name>Proc. International Conference on Computational Science and Computational Intelligence</conf-name> (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: <publisher-name>CSCI</publisher-name>). </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Imteaj</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Thakker</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Amini</surname>
<given-names>M. H.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A Survey on Federated Learning for Resource-Constrained IoT Devices</article-title>. <source>IEEE Internet Things J.</source> <pub-id pub-id-type="doi">10.1109/jiot.2021.3095077</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jurafsky</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Martin</surname>
<given-names>J.&#x20;H.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Speech and Language Processing</source>. <publisher-loc>NJ, USA</publisher-loc>: <publisher-name>Prentice-Hall</publisher-name>.</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Niyato</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Incentive Mechanism for Reliable Federated Learning: A Joint Optimization Approach to Combining Reputation and Contract Theory</article-title>. <source>IEEE Internet Things J.</source> <volume>6</volume> (<issue>6</issue>), <fpage>10700</fpage>&#x2013;<lpage>10714</lpage>. <pub-id pub-id-type="doi">10.1109/jiot.2019.2940820</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Niyato</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Guizani</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Reliable Federated Learning for mobile Networks</article-title>. <source>IEEE Wireless Commun.</source> <volume>27</volume> (<issue>2</issue>), <fpage>72</fpage>&#x2013;<lpage>80</lpage>. <pub-id pub-id-type="doi">10.1109/mwc.001.1900119</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Bae</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Future of IoT Networks: A Survey</article-title>. <source>Appl. Sci.</source> <volume>7</volume> (<issue>10</issue>), <fpage>1072</fpage>. <pub-id pub-id-type="doi">10.3390/app7101072</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sahu</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Talwalkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Federated Learning: Challenges, Methods, and Future Directions</article-title>. <source>IEEE Signal. Process. Mag.</source> <volume>37</volume> (<issue>3</issue>), <fpage>50</fpage>&#x2013;<lpage>60</lpage>. <pub-id pub-id-type="doi">10.1109/msp.2020.2975749</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yuan</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xiong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Niyato</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Federated Learning for 6G Communications: Challenges, Methods, and Future Directions</article-title>. <source>China Commun.</source> <volume>17</volume> (<issue>9</issue>), <fpage>105</fpage>&#x2013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.23919/jcc.2020.09.009</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mow</surname>
<given-names>W. H.</given-names>
</name>
<name>
<surname>Vucetic</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep Autoencoder Learning for Relay-Assisted Cooperative Communication Systems</article-title>. <source>IEEE Trans. Commun.</source> <volume>68</volume> (<issue>9</issue>), <fpage>5471</fpage>&#x2013;<lpage>5488</lpage>. <pub-id pub-id-type="doi">10.1109/tcomm.2020.2998538</pub-id> </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nakano</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Moriwaki</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Takahashi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Akagiri</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Morio</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>1982</year>). <article-title>A New 8-bit Pcm Audio Recording Technique Using an Extension of the Video Track</article-title>. <source>IEEE Trans. Consumer Electron.</source> <volume>CE-28</volume> (<issue>3</issue>), <fpage>241</fpage>&#x2013;<lpage>249</lpage>. <pub-id pub-id-type="doi">10.1109/tce.1982.353917</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oord</surname>
<given-names>A. V. D.</given-names>
</name>
<name>
<surname>Dieleman</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zen</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Vinyals</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Graves</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Wavenet: A Generative Model for Raw Audio</article-title>. <source>arXiv:1609.03499</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1609.03499">http://arxiv.org/abs/1609.03499</ext-link>.</comment> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>O&#x2019;Shea</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hoydis</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>An Introduction to Machine Learning Communications Systems</article-title>. <source>arXiv:1702.00832</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1702.00832">http://arxiv.org/abs/1702.00832</ext-link>
</comment>. </citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Panayotov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Povey</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Khudanpur</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>LibriSpeech: An ASR Corpus Based on Public Domain Audio Books</article-title>,&#x201d; in <conf-name>Proc. of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</conf-name> (<publisher-loc>Queensland, Australia</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>5206</fpage>&#x2013;<lpage>5210</lpage>. </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pfau</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Hoffmann</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Noe</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Hardware-Efficient Coherent Digital Receiver Concept with Feedforward Carrier Recovery for M-QAM Constellations</article-title>. <source>J.&#x20;Lightwave Technol.</source> <volume>27</volume> (<issue>8</issue>), <fpage>989</fpage>&#x2013;<lpage>999</lpage>. <pub-id pub-id-type="doi">10.1109/jlt.2008.2010511</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bennis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A Vision of 6G Wireless Systems: Applications, Trends, Technologies, and Open Research Problems</article-title>. <source>IEEE Netw.</source> <volume>34</volume> (<issue>3</issue>), <fpage>134</fpage>&#x2013;<lpage>142</lpage>. <pub-id pub-id-type="doi">10.1109/mnet.001.1900287</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Schneider</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Baevski</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Collobert</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Auli</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>wav2vec: Unsupervised Pre-training for Speech Recognition</article-title>. <source>arXiv:1904.05862</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1904.05862">http://arxiv.org/abs/1904.05862</ext-link>.</comment> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shannon</surname>
<given-names>C. E.</given-names>
</name>
</person-group> (<year>1948</year>). <article-title>A Mathematical Theory of Communication</article-title>. <source>Bell Syst. Tech. J.</source> <volume>27</volume> (<issue>3</issue>), <fpage>379</fpage>&#x2013;<lpage>423</lpage>. <pub-id pub-id-type="doi">10.1002/j.1538-7305.1948.tb01338.x</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shewalkar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nyavanandi</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ludwig</surname>
<given-names>S. A.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Performance Evaluation of Deep Neural Networks Applied to Speech Recognition: RNN, LSTM and GRU</article-title>. <source>J.&#x20;Artif. Intelligence Soft Comput. Res.</source> <volume>9</volume> (<issue>4</issue>), <fpage>235</fpage>&#x2013;<lpage>245</lpage>. <pub-id pub-id-type="doi">10.2478/jaiscr-2019-0006</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>From Semantic Communication to Semantic-Aware Networking: Model, Architecture, and Open Problems</article-title>. <source>arXiv:2012.15405</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2012.15405">http://arxiv.org/abs/2012.15405</ext-link>.</comment> </citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tran</surname>
<given-names>N. H.</given-names>
</name>
<name>
<surname>Bao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zomaya</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Nguyen</surname>
<given-names>M. N. H.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>C. S.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Federated Learning over Wireless Networks: Optimization Model Design and Analysis</article-title>,&#x201d; in <conf-name>Proc. IEEE Conference on Computer Communications</conf-name> (<publisher-loc>Paris, France</publisher-loc>: <publisher-name>IEEE</publisher-name>). </citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Uysal</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kaya</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Ephremides</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Codreanu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Popovski</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Semantic Communications in Networked Systems</article-title>. <source>arXiv:2103.05391</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2103.05391">http://arxiv.org/abs/2103.05391</ext-link>.</comment> </citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep Learning for Optimal Deployment of UAVs with Visible Light Communications</article-title>. <source>IEEE Trans. Wireless Commun.</source> <volume>19</volume> (<issue>11</issue>), <fpage>7049</fpage>&#x2013;<lpage>7063</lpage>. <pub-id pub-id-type="doi">10.1109/twc.2020.3007804</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A Lite Distributed Semantic Communication System for Internet of Things</article-title>. <source>IEEE J.&#x20;Select. Areas Commun.</source> <volume>39</volume> (<issue>1</issue>), <fpage>142</fpage>&#x2013;<lpage>153</lpage>. <pub-id pub-id-type="doi">10.1109/jsac.2020.3036968</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xie</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>G. Y.</given-names>
</name>
<name>
<surname>Juang</surname>
<given-names>B.-H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep Learning Enabled Semantic Communication Systems</article-title>. <source>arXiv:2006.10685</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2006.10685">http://arxiv.org/abs/2006.10685</ext-link>.</comment> </citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Federated Learning via Over-the-air Computation</article-title>. <source>IEEE Trans. Wireless Commun.</source> <volume>19</volume> (<issue>3</issue>), <fpage>2022</fpage>&#x2013;<lpage>2035</lpage>. <pub-id pub-id-type="doi">10.1109/twc.2019.2961673</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Saad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>C. S.</given-names>
</name>
<name>
<surname>Shikh-Bahaei</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Energy Efficient Federated Learning over Wireless Communication Networks</article-title>. <source>IEEE Trans. Wireless Commun.</source> <volume>20</volume> (<issue>3</issue>), <fpage>1935</fpage>&#x2013;<lpage>1949</lpage>. <pub-id pub-id-type="doi">10.1109/twc.2020.3037554</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wong</surname>
<given-names>K.-K.</given-names>
</name>
<name>
<surname>Poor</surname>
<given-names>H. V.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Federated Learning for 6G: Applications, Challenges, and Opportunities</article-title>. <source>arXiv:2101.01338</source>. <comment>[Online]. Available: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2101.01338">http://arxiv.org/abs/2101.01338</ext-link>.</comment> </citation>
</ref>
</ref-list>
</back>
</article>