<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="review-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id>
<journal-title>Frontiers in Computer Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title>
<issn pub-type="epub">2624-9898</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">750284</article-id>
<article-id pub-id-type="doi">10.3389/fcomp.2021.750284</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Computer Science</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>An Evaluation of Speech-Based Recognition of Emotional and Physiological Markers of Stress</article-title>
<alt-title alt-title-type="left-running-head">Baird et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Recognition of Markers of Stress</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Baird</surname>
<given-names>Alice</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/727887/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Triantafyllopoulos</surname>
<given-names>Andreas</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Z&#xe4;nkert</surname>
<given-names>Sandra</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ottl</surname>
<given-names>Sandra</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Christ</surname>
<given-names>Lukas</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Stappen</surname>
<given-names>Lukas</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1435411/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Konzok</surname>
<given-names>Julian</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sturmbauer</surname>
<given-names>Sarah</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1498356/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Me&#x00DF;ner</surname>
<given-names>Eva-Maria</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/345650/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kudielka</surname>
<given-names>Brigitte M.</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/492/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rohleder</surname>
<given-names>Nicolas</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/177369/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Baumeister</surname>
<given-names>Harald</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/385425/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Schuller</surname>
<given-names>Bj&#xf6;rn W.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/419411/overview"/>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>Chair of Embedded Intelligence for Health Care and Wellbeing, University of Augsburg, <addr-line>Augsburg</addr-line>, <country>Germany</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>AudEERING GmbH, <addr-line>Gilching</addr-line>, <country>Germany</country>
</aff>
<aff id="aff3">
<label>
<sup>3</sup>
</label>Institute of Psychology, University of Regensburg, <addr-line>Regensburg</addr-line>, <country>Germany</country>
</aff>
<aff id="aff4">
<label>
<sup>4</sup>
</label>Chair of Health Psychology, FAU Erlangen-Nuremberg, <addr-line>Erlangen</addr-line>, <country>Germany</country>
</aff>
<aff id="aff5">
<label>
<sup>5</sup>
</label>Chair of Clinical Psychology and Psychotherapy, University of Ulm, <addr-line>Ulm</addr-line>, <country>Germany</country>
</aff>
<aff id="aff6">
<label>
<sup>6</sup>
</label>GLAM&#x2014;Group on Language, Audio, and Music, Imperial College London, <addr-line>London</addr-line>, <country>United&#x20;Kingdom</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1150780/overview">Marwa Mahmoud</ext-link>, University of Cambridge, United&#x20;Kingdom</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/471472/overview">Robertas Damasevicius</ext-link>, Silesian University of Technology, Poland</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/181800/overview">Ronald B&#xf6;ck</ext-link>, Otto von Guericke University Magdeburg, Germany</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Alice Baird, <email>alicebaird@ieee.org</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Human-Media Interaction, a section of the journal Frontiers in Computer Science</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>12</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>3</volume>
<elocation-id>750284</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>07</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>29</day>
<month>10</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Baird, Triantafyllopoulos, Z&#xe4;nkert, Ottl, Christ, Stappen, Konzok, Sturmbauer, Me&#x00DF;ner, Kudielka, Rohleder, Baumeister and Schuller.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Baird, Triantafyllopoulos, Z&#xe4;nkert, Ottl, Christ, Stappen, Konzok, Sturmbauer, Me&#x00DF;ner, Kudielka, Rohleder, Baumeister and Schuller</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>Life in modern societies is fast-paced and full of stress-inducing demands. The development of stress monitoring methods is a growing area of research due to the personal and economic advantages that timely detection provides. Studies have shown that speech-based features can be utilised to robustly predict several physiological markers of stress, including emotional state, continuous heart rate, and the stress hormone, cortisol. In this contribution, we extend previous works by the authors, utilising three German language corpora including more than 100 subjects undergoing a Trier Social Stress Test protocol. We present cross-corpus and transfer learning results which explore the efficacy of the speech signal to predict three physiological markers of stress&#x2014;sequentially measured saliva-based cortisol, continuous heart rate as beats per minute (BPM), and continuous respiration. For this, we extract several features from audio as well as video and apply various machine learning architectures, including a temporal context-based Long Short-Term Memory Recurrent Neural Network (LSTM-RNN). For the task of predicting cortisol levels from speech, deep learning improves on results obtained by conventional support vector regression&#x2014;yielding a Spearman correlation coefficient (<italic>&#x3c1;</italic>) of 0.770 and 0.698 for cortisol measurements taken 10 and 20&#xa0;min after the stress period for the two corpora applicable&#x2014;showing that audio features alone are sufficient for predicting cortisol, with audiovisual fusion to an extent improving such results. We also obtain a Root Mean Square Error (RMSE) of 38 and 22&#x2009;BPM for continuous heart rate prediction on the two corpora where this information is available, and a normalised RMSE (NRMSE) of 0.120 for respiration prediction (&#x2212;10:&#x2009;10). 
Both of these continuous physiological signals show to be highly effective markers of stress (based on cortisol grouping analysis), both when available as ground truth and when predicted using speech. This contribution opens up new avenues for future exploration of these signals as proxies for stress in naturalistic settings.</p>
</abstract>
<kwd-group>
<kwd>affective computing</kwd>
<kwd>stress</kwd>
<kwd>computer audition</kwd>
<kwd>paralinguistics</kwd>
<kwd>multimodal</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Understanding how stress manifests in the human body has several meaningful use-cases, from improving safety during driving (<xref ref-type="bibr" rid="B8">Bianco et&#x20;al., 2019</xref>) to early intervention of neurodegeneration (<xref ref-type="bibr" rid="B80">Zafar, 2020</xref>). Stress levels are globally on the rise, primarily due to the increased pressure from work and personal lifestyles (<xref ref-type="bibr" rid="B67">Sharma et&#x20;al., 2021</xref>). Many individuals find themselves constantly dealing with several concurrent tasks, a feat known to put well-being off-balance, particularly during work (<xref ref-type="bibr" rid="B50">Pag&#xe1;n-Casta&#xf1;o et&#x20;al., 2020</xref>). With this in mind, methods that can reduce levels of stress whilst still enabling the desired level of efficiency are highly desirable in workplace environments as they can be used to proactively prevent burnout, which is known to follow consistent stress (<xref ref-type="bibr" rid="B18">Fendel et&#x20;al., 2020</xref>). During a stress inducing situation, the adrenal glands begin to produce various hormones, of which cortisol (a stress hormone) is the most prominent (<xref ref-type="bibr" rid="B41">Leistner and Menke, 2020</xref>). The production of cortisol responds to the activation of the hypothalamic-pituitary-adrenal (HPA) axis, which begins to secrete the corticotropin-releasing hormone that causes the additional release of the adrenocorticotrophic hormone (ACTH) from the pituitary. The release of such hormones is known to alter other physiological responses, including heart rate (<xref ref-type="bibr" rid="B23">G&#xf6;n&#xfc;late&#x15f; et&#x20;al., 2017</xref>), which in turn affects face colouring (<xref ref-type="bibr" rid="B48">Niu et&#x20;al., 2018</xref>) and speech, particularly during psychosocial stress (<xref ref-type="bibr" rid="B9">Brugnera et&#x20;al., 2018</xref>). 
With this in mind, the speech signal can (non-intrusively) computationally monitor several states of wellbeing (<xref ref-type="bibr" rid="B11">Cummins et&#x20;al., 2018</xref>). It has shown promise in recent studies to indicate physiological signals which are known to be markers of stress, e.&#x2009;g., correlation with saliva-based cortisol samples (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>), states of emotional arousal (<xref ref-type="bibr" rid="B70">Stappen et&#x20;al., 2021a</xref>), and co-occurring conditions including anxiety (<xref ref-type="bibr" rid="B4">Baird et&#x20;al., 2020</xref>).</p>
<p>In this study, we extend previous works by the authors (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>), by more deeply exploring the utility of speech for monitoring stress. We use three German corpora, the <sc>FAU</sc>, <sc>Ulm</sc>- and <sc>Reg-TSST</sc> which were all gathered with the renowned Trier Social Stress Test (TSST) protocol (<xref ref-type="bibr" rid="B35">Kirschbaum et&#x20;al., 1993</xref>), and contain more than 100 subjects in total. In previous studies utilising the <sc>FAU-TSST</sc> dataset (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>), speech derived features were found to strongly correlate with raw cortisol taken 10 and 20&#xa0;min after the spoken task in the TSST, which supported the use of speech as a marker of stress, mainly as cortisol is known to peak between 10 and 20&#xa0;min after a stress stimulus (<xref ref-type="bibr" rid="B24">Goodman et&#x20;al., 2017</xref>). With this in mind, we aim to more closely explore the connection between spoken features and sequential cortisol samples extracted from saliva. To do this, we will perform a fundamental acoustic analysis of grouped signals and then, via a deep learning recognition paradigm, explore each corpus applying transfer learning to validate the efficacy of speech-based cortisol recognition on unseen data. Furthermore, as the <sc>Ulm</sc>- and <sc>Reg-TSST</sc> corpora both contain continuous heart rate as beats per minute (BPM), and <sc>Ulm-TSST</sc> additionally includes respiration signals, we aim to recognise these signals and explore their relationship with the saliva-based cortisol samples to validate their use as markers of stress. There are also two speech scenarios within a TSST, the job interview and arithmetic task, separating these&#x2014;we will also explore how the speech duration and activation in general affects recognition rates with a more fine-grained continuous analysis. 
Finally, we utilise visual-based features (where available) for multimodal recognition of relevant stress bio-markers and compare the performance of this to&#x20;audio.</p>
<p>To summarise, the following analysis includes several insights and contributions. At the core, this work extends on previous results by the authors <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, and for the first time, explores the task of sequentially sampled cortisol prediction from multimodal data within a deep learning-based architecture, validating the experimental paradigm via utility of a novel dataset. Therefore the areas explored through the utilisation of sequential cortisol as ground truth for stress are threefold: 1) The utility of speech plus multimodal features for recognition of other physiological-derived signals is validated or not. 2) A fundamental acoustic analysis of speech under stress is conducted, utilising cortisol derived groupings as a ground truth for stress. 3) Multi-domain experiments are conducted to further validate previous works&#x2019; findings with newly collected&#x20;data.</p>
<p>This article is organised as follows; firstly, in <xref ref-type="sec" rid="s2">Section 2</xref>, we provide a brief overview of related studies in the area of stress recognition. We then introduce the three corpora that have been used within the experiments in cf. <xref ref-type="sec" rid="s3">Section 3</xref>, as well as offering detail of the TSST study procedure in general. This is followed by an acoustic analysis of each corpus in <xref ref-type="sec" rid="s4">Section 4</xref>. We then outline the experimental set-up for the recognition tasks, in <xref ref-type="sec" rid="s5">Section 5</xref>, and present our experimental results in <xref ref-type="sec" rid="s6">Section 6</xref>, with a mention of study limitation in <xref ref-type="sec" rid="s7">Section 7</xref>. Finally, we offer concluding remarks and future outlook in <xref ref-type="sec" rid="s8">Section&#x20;8</xref>.</p>
</sec>
<sec id="s2">
<title>2 Related Work</title>
<p>Stress recognition has been an active research area within the machine learning and affective computing communities for several years, thus making an extensive summary of this area of research beyond the scope of the current work. In this section, we discuss various approaches which have motivated aspects of our work and would suggest that the interested reader is directed to <xref ref-type="bibr" rid="B51">Panicker and Gayathri (2019)</xref> or <xref ref-type="bibr" rid="B25">Grzadzielewska (2021)</xref>, for a survey on stress recognition in general, and to <xref ref-type="bibr" rid="B19">Garcia-Ceja et&#x20;al. (2018)</xref>, for mental health state recognition using machine learning.</p>
<p>As mentioned, speech as a marker of stress was explored in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, and sequentially measured cortisol samples were for the first time recognised in a traditional machine learning paradigm utilising a support vector regressor (SVR) with hand-crafted and image-based speech-derived features. Findings from this study showed that elevated cortisol levels&#x2014;taken between 10 and 20&#xa0;min after the TSST, i.&#x2009;e., the time of speech under stress&#x2014;correlate to a substantial level (Spearman&#x2019;s correlation coefficient (<italic>&#x3c1;</italic>) of at best 0.421) with hand-crafted prosodic related feature sets performing&#x20;best.</p>
<p>Aside from the work presented in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, there have been limited computational machine learning-based works which have looked at sequentially sampled saliva-based cortisol. However, in <xref ref-type="bibr" rid="B47">Nath et&#x20;al. (2021)</xref>, the authors aim to provide a system for monitoring stress in older subjects and in this case instead of explicitly recognising the raw cortisol values, they utilise the samples to produce a ground truth for stress or no stress, in order to perform a binary classification on the subjects. Instead of speech-derived features, the authors perform various experiments based on the features extracted from wearable sensors, e.&#x2009;g., blood volume pressure and electrodermal activity. In this study, in particular, the authors find substantial improvement through the use of an LSTM-based deep learning architecture, obtaining an accuracy for the 2-class problem above 90% <italic>F</italic>
<sup>1</sup> via the feature selection of physiological based signals. Such results show promise for the use of deep learning in the context of stress recognition.</p>
<p>There are several machine learning approaches that have explored physiologically derived markers in general for stress recognition (<xref ref-type="bibr" rid="B43">MacLaughlin et&#x20;al., 2011</xref>; <xref ref-type="bibr" rid="B14">Dhama et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B61">&#x160;alkevicius et&#x20;al., 2019</xref>). These range from feature-based machine learning paradigms which classify various features extracted from wearable sensors, i.&#x2009;e., sleep quality, and percentage of screen time (<xref ref-type="bibr" rid="B62">Sano and Picard, 2013</xref>), or heart rate variability (HRV) (<xref ref-type="bibr" rid="B13">Dalmeida and Masala, 2021</xref>), to thermal-video recognition of the Initial Systolic Time Interval (<xref ref-type="bibr" rid="B40">Kumar S. et&#x20;al., 2021</xref>), applying the state-of-the-art StressNet. StressNet consists of a Long Short-Term Memory (LSTM)-based architecture to harness spatial-temporal aspects of a continuous signal. Similarly, in a recent study, the DeepBreath system has been presented (<xref ref-type="bibr" rid="B10">Cho et&#x20;al., 2017</xref>), a CNN-based architecture which was applied to small-scale datasets for stress recognition, and obtains up to 84.59% accuracy for a binary stress task and 56.52% for a 3-class problem. <xref ref-type="bibr" rid="B39">Kumar A. et&#x20;al. (2021)</xref> present a hierarchical deep neural network that learns high-level feature representations for each type of physiological signal.</p>
<p>The use-cases associated with these approaches vary, with works in recent years being targeted at products including driver monitoring (<xref ref-type="bibr" rid="B29">Healey and Picard, 2005</xref>). However, a major limitation for such stress research is that stress can be potentially harmful to individuals, thus raising ethical concerns which make the collection of spontaneous and natural stress occurrences difficult in practice. With this in mind, the TSST is a standardised and common paradigm (<xref ref-type="bibr" rid="B64">Schmidt et&#x20;al., 2018</xref>), which some stress targeted corpora have applied as it is known to induce moderate psychosocial stress to subjects (<xref ref-type="bibr" rid="B15">Dickerson and Kemeny, 2004</xref>; <xref ref-type="bibr" rid="B55">Plarre et&#x20;al., 2011</xref>). Smaller-scale datasets following these established protocols have been collected and used for machine learning-based stress recognition (<xref ref-type="bibr" rid="B12">Cuno et&#x20;al., 2020</xref>). The SWELL dataset (<xref ref-type="bibr" rid="B36">Koldijk et&#x20;al., 2014</xref>) (25 subjects, 8 female), is one where <italic>time-pressure</italic> and <italic>interruptions</italic> are integrated in the task which the subjects are asked to perform. In a similar way, cognitive load is another method for inducing stress, and in the renowned SUSAS (<xref ref-type="bibr" rid="B28">Hansen and Bou-Ghazale, 1997</xref>) corpora (aimed at robust speech processing from stressed and emotional speech), 32 subjects perform various &#x201c;tracking&#x201d; tasks, which increase in their complexity.</p>
<p>Several studies based on these available datasets utilise classical machine learning methods to explore the relationship of multimodal features with stress. In <xref ref-type="bibr" rid="B57">Rodr&#xed;guez-Arce et&#x20;al. (2020)</xref>, the authors apply a Support Vector Machine (SVM), k-Nearest Neighbours (KNN), Random Forest and Logistic Regression (LogR) classifiers to analyse the accuracy of feature subsets based on various modalities, e.&#x2009;g., heart rate, respiration, and galvanic skin response. The limited available data makes deep learning approaches a challenge, however in the 2021 Multimodal Sentiment Analysis in Real-Life Media Challenge (MuSe) (<xref ref-type="bibr" rid="B70">Stappen et&#x20;al., 2021a</xref>), the <sc>Ulm-TSST</sc> corpus was presented and successfully utilised for emotion-based stress recognition during a TSST. The baseline for the <italic>Multimodal Emotional Stress sub-challenge</italic>&#x2009;(<sc>MuSe-Stress</sc>) task (recognition of valence and arousal during stress) applies an LSTM-RNN with a late multimodal fusion of audio plus video-based features, obtaining a concordance correlation coefficient (CCC) of 0.509 (for combined arousal and valence). Audio features perform best for the uni-modal approaches in the MuSe paradigm, with <sc>eGeMAPS</sc> &#x2009; (<xref ref-type="bibr" rid="B16">Eyben et&#x20;al., 2016</xref>) features yielding a CCC of 0.472, compared to a CCC of 0.305 for video-based <sc>VGGface</sc>&#x2009; features.</p>
<p>From this literature overview, it is clear that there is missing analysis in the literature, and a need to explore more deeply the utility of markers of stress e.&#x2009;g., cortisol, in a machine learning paradigm. Computational understanding of cortisol is particularly meaningful, as it is known that, sustained levels of stress are substantial contributors to neurodegeneration (<xref ref-type="bibr" rid="B80">Zafar, 2020</xref>), with biological markers of this including fluctuations in neurotransmitters, e.&#x2009;g., dopamine or serotonin, and levels of stress hormones including cortisol, as discussed by <xref ref-type="bibr" rid="B80">Zafar (2020)</xref>. More specifically, in <xref ref-type="bibr" rid="B60">Saitis and Kalimeri (2018)</xref> the authors use related bio-markers to automatically detect environments that are stressful for visually impaired persons which might help to improve accessibility within public spaces. This illustrates that successful monitoring of stress via such markers has a benefit beyond commercial applications.</p>
<p>Furthermore, as can be seen, current studies are largely based on smaller-scale corpora (ca. 30 subjects), with the current contribution attempting to go deeper by not only exploring across multiple corpora but in general including a more substantial number of speakers (&#x2b;100) than is typically observed in the literature thus far. As well as this, applying deep learning, particularly an LSTM-RNN, appears to be a valid deep learning architecture for modelling states of continuous stress, and motivates us to explore the use of this in comparison to more robust models, e.&#x2009;g., the SVR. Finally, in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref> no other modalities were explored for recognising the cortisol-derived markers of stress, neither in a uni- or multimodal manner, and so this strongly motivates the current work to explore how vision-based features perform in this setting.</p>
</sec>
<sec id="s3">
<title>3 Corpora</title>
<p>For our experiments we utilise three corpora&#x2014;the FAU-Trier Social Stress Test (<sc>FAU-TSST</sc>), the Regensburg-Trier Social Stress Test (<sc>Reg-TSST</sc>), and the Ulm-Trier Social Stress Test (<sc>Ulm-TSST</sc>)&#x2014;which all include subjects undergoing the renowned and highly standardised Trier Social Stress Test (TSST) (<xref ref-type="bibr" rid="B35">Kirschbaum et&#x20;al., 1993</xref>). All subjects were speaking in the German language and were recorded at Universities from southern German states (Bavaria and Baden-W&#xfc;rttemberg). After processing, the total amount of speakers is 134, including 50 males and 84 females. The <sc>FAU-TSST</sc> corpus was first introduced in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, and the <sc>Ulm-TSST</sc> corpus in <xref ref-type="bibr" rid="B70">Stappen et&#x20;al. (2021a)</xref>. In <xref ref-type="table" rid="T1">Table&#x20;1</xref>, we provide an overview of all data available across each corpus. As can be seen, the only modality available in all corpora is audio. All three corpora have speech data from the job interview (interview) task (described in detail below), however the <sc>Ulm-TSST</sc> corpus does not include the arithmetic&#x20;task.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>An overview of each of the three corpora (<sc>FAU</sc>) (<sc>Reg</sc>) and (<sc>Ulm</sc>)-TSST used within this contribution. Including, number of subjects (&#x23;), distribution of gender (M)ale: (F)emale, Age in years (mean/standard deviation), continuous signals available for each&#x2014;(A)udio, (V)ideo, heart rate as beats per minute (B)PM, (R)espiration, (C)ortisol and (E)motion (arousal and valence)&#x2014;as well as, the speaker independent partitions, train, (dev)elopment and test, and the duration of audio data, after voice activity detection (VAD) and for each TSST task, (Inter)view, and (Arith)methic.</p>
</caption>
<table>
<thead>
<tr>
<td rowspan="2" align="left"/>
<td rowspan="2" align="center">&#x23; (M:F)</td>
<td rowspan="2" align="center">Age <italic>&#x3bc;</italic>/&#xb1;</td>
<td colspan="6" align="center">Modes</td>
<td colspan="4" align="center">Duration (hh:&#x2009;mm)</td>
<td colspan="4" align="center">Partitions</td>
</tr>
<tr>
<td align="center">A</td>
<td align="center">V</td>
<td align="center">B</td>
<td align="center">R</td>
<td align="center">C</td>
<td align="center">E</td>
<td align="left"/>
<td align="center">VAD</td>
<td align="center">Inter.</td>
<td align="center">Arith.</td>
<td align="center">Train</td>
<td align="center">Dev</td>
<td align="center">Test</td>
<td align="center">
<italic>&#x2211;</italic>
</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<sc>FAU</sc>
</td>
<td align="center">43 (14:29)</td>
<td align="center">24.26/4.97</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">&#x20DE;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">7:&#x2009;25</td>
<td align="center">4:&#x2009;20</td>
<td align="center">2:&#x2009;32</td>
<td align="center">1:&#x2009;48</td>
<td align="center">15</td>
<td align="center">15</td>
<td align="center">13</td>
<td align="center">43</td>
</tr>
<tr>
<td align="left">
<sc>Reg</sc>
</td>
<td align="center">27 (13:14)</td>
<td align="center">22.74/2.96</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">4:&#x2009;28</td>
<td align="center">2:&#x2009;26</td>
<td align="center">1:&#x2009;24</td>
<td align="center">1:&#x2009;02</td>
<td align="center">10</td>
<td align="center">9</td>
<td align="center">8</td>
<td align="center">27</td>
</tr>
<tr>
<td align="left">
<sc>Ulm</sc>
</td>
<td align="center">69 (20:49)</td>
<td align="center">25.06/4.48</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x22A0;</td>
<td align="center">&#x20DE;</td>
<td align="center">&#x22A0;</td>
<td align="center">5:&#x2009;47</td>
<td align="center">2:&#x2009;21</td>
<td align="center">2:&#x2009;21</td>
<td align="center">&#x2013;</td>
<td align="center">41</td>
<td align="center">14</td>
<td align="center">14</td>
<td align="center">69</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s3-1">
<title>3.1 The Trier Social Stress Test Procedure</title>
<p>Each testing site obtained ethical approval from their respective university&#x2019;s ethics committee to perform the TSST study. In all cases, subjects were recruited from the university campus and the community via print and multi-media advertising and received monetary compensation. The study was carried out in accordance with the declaration of Helsinki, and informed consent was obtained from all subjects at study entry. For the <sc>Reg-TSST</sc>, eligible subjects were then invited to a first laboratory session to conduct a structured clinical interview (<xref ref-type="bibr" rid="B77">Wittchen et&#x20;al., 1997</xref>) for exclusion of acute or chronic psychiatric diseases. Further exclusion criteria applied to all corpora included; acute or chronic somatic diseases, psychotropic or glucocorticoid medication intake, BMI above 30&#xa0;kg/m<sup>2</sup>, drug abuse, and previous experience with the TSST procedure.</p>
<p>For all corpora, the participants did not know the details of the tasks and were given this information upon entering the TSST study room. The prior experience that subjects may have had with these styles of speaking tasks is unknown, although they were not informed of the task details prior to entering the test site. For the interview task, the participants were not restricted to a particular vacant position but rather considered it to be the interview for their &#x2018;dream role&#x2019;. Furthermore, it is unknown how many participants had a prior relationship with the panel, although there is likely some previous acquaintance-level relationship given the university location.</p>
<p>In <xref ref-type="fig" rid="F1">Figure&#x20;1</xref> a timeline is given for the general TSST experiment. There was slight variance at each test site; however, we attempt to combine the description of procedure. The TSSTs were scheduled between 12:&#x2009;00 p.m. and 7:&#x2009;00 p.m. to account for the influence of circadian cortisol variations (<xref ref-type="bibr" rid="B58">Rohleder and Nater, 2009</xref>). Instructions for the subjects included refraining from exercising, smoking, teeth brushing, eating, and drinking anything except water before the arrival. Upon arrival, subjects received verbal and written instructions, followed by a resting period. During this time, for the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> a saliva sample (S0 30&#x2013;45&#xa0;min before TSST) was collected as the participant&#x2019;s cortisol baseline, and for the <sc>Reg-TSST</sc> corpus, a sugary drink (chilled herbal tea with 75&#xa0;g of glucose) was given to elevate blood glucose levels (<xref ref-type="bibr" rid="B81">Z&#xe4;nkert et&#x20;al., 2020</xref>). One minute before the next stage, another saliva sample is taken (S1 1&#xa0;min). The subjects are then introduced to the TSST procedure, and guided to a test room, and introduced to observers wearing white lab coats. Subjects were then instructed to take the role of a job applicant and give a 5-min speech to present themselves as the best candidate for a vacant position. This task is where continuous recording begins for the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc>
<xref ref-type="fn" rid="fn1">
<sup>1</sup>
</xref>. After this, in the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora, subjects were given a mental arithmetic task, for a further 5&#xa0;min, where they should serially subtract 17 from 2&#x2009;043 as quickly as possible. In the case of any error, they were requested to start again. After completion of the TSST speaking tasks, six more saliva based samples are taken from the subjects (S2-S7).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>An overview of the typical TSST paradigm, as applied in <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, where <sc>Ulm-TSST</sc> excludes the arithmetic task.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g001.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Target Signals</title>
<p>As seen in <xref ref-type="table" rid="T1">Table&#x20;1</xref> overview, there are several signals available for each of the three corpora. As a core task, we focus on the recognition of sequential saliva-based cortisol measures S0 (&#x2212;45&#xa0;min) to S7 (&#x2b;60&#xa0;min), measured in nanomoles per litre (nmol/L). For the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora, saliva is collected at the same time-points, before and after the TSST, and stored at &#x2212;20&#xb0;C before extraction. However, for each corpus, the assay (i.&#x2009;e., biochemical analysis procedure) applied to extract cortisol varied, where <sc>FAU-TSST</sc> utilise a chemiluminescence immunoassay (CLIA), and <sc>Reg-TSST</sc> a fluorescence-based immunoassay (DELFIA) meaning that the derived cortisol value is not completely comparable, for further detail on the difference in these procedures the interested reader is directed to <xref ref-type="bibr" rid="B45">Miller et&#x20;al. (2013)</xref>. With this in mind for the experiments in later sections the two corpora will only be utilised in a multi-domain manner, and not with a typical cross-corpus strategy, cf. <xref ref-type="sec" rid="s6">Section&#x20;6</xref>.</p>
<p>Given this, we first want to analyse the variance in raw cortisol between the two corpora, and so we apply a repeated measures analysis of variance (RM-ANOVA) with raw cortisol (S0-S7) as within-subject factor time and the between-subject factor corpora (<sc>FAU-TSST</sc> vs <sc>Reg-TSST</sc>). Due to lack of sphericity (pointing to unequal variances of within-subject measures) we report the Greenhouse-Geisser adjusted <italic>p</italic>-value. We find a significant main effect of the corpora [F (1, 67) &#x3d; 4.02, <italic>p</italic>&#x20;&#x3d; 0.049, <italic>&#x3b7;</italic>
<sup>2</sup>&#x20;&#x3d;&#x20;0.03] indicating that on average <sc>FAU-TSST</sc> raw cortisol is higher compared to <sc>Reg-TSST</sc> raw cortisol. Further, we see a significant&#x20;time &#xd7; corpora interaction [F (1.76, 120.08) &#x3d; 4.52, <italic>p</italic>&#x20;&#x3d; 0.016, <italic>&#x3b7;</italic>
<sup>2</sup> &#x3d; 0.017] with a slightly earlier and higher rise in raw cortisol in <sc>FAU-TSST</sc> compared to <sc>Reg-TSST</sc>. Also, testing the homogeneity of variances of S0-S7 with Levene&#x2019;s Test reveals that for S0-S2, we can assume homogeneous variances (<italic>p</italic>&#x20;&#x3e; 0.1) whereas for S3-S7, we see inhomogeneous variances (<italic>p</italic>&#x20;&#x3c; 0.05). Whereas variances are comparable for S0-S2, for S3-S7 variances in the <sc>FAU-TSST</sc> corpora for raw cortisol are higher compared to <sc>Reg-TSST</sc>. This suggests a large difference between both corpora regarding intra-individual cortisol trajectories in response to the TSST. For an overview of the raw cortisol in each corpus, cf. the left of <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>, as can be seen at points the variance in the subject&#x2019;s response becomes quite large, which is likely due to some subjects physiologically responding less to this type of stress than others, &#x201c;non-responders&#x201d;. As the cortisol of the two corpora is derived with a different assay, and given these statistical differences, the two corpora will be treated individually unless otherwise stated.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Mean of raw cortisol as (nmol/L) for both <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, highlighting the stress period in grey, with annotations of sequential saliva (S), and sample time in minutes <bold>(left)</bold>. Distribution of continuous heart rate as BPM for the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc> corpora <bold>(middle)</bold>. Distribution of the respiration-based signal as millivolts (mV) based on chest displacement <bold>(right)</bold>.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g002.tif"/>
</fig>
<p>For the <sc>Ulm-TSST</sc> and <sc>Reg-TSST</sc> corpora, we additionally explore the continuous physiological signals available cf. <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> (middle). We utilise heart rate as Beats per Minute (BPM) from the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc> corpora, and for the <sc>Ulm-TSST</sc> corpus, we also utilise the respiration signal provided cf. <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> (right), which is based on chest displacement at a range of &#x2212;10 to &#x2b;10 millivolts (mV), where negative indicates an exhalation and positive an inhalation. Both of these physiological signals are known to alter during stress stimuli (<xref ref-type="bibr" rid="B6">Bernardi et&#x20;al., 2000</xref>). Of note, from <xref ref-type="fig" rid="F2">Figure&#x20;2</xref> (centre) we see that the BPM signal for <sc>Reg-TSST</sc> contains values below 50&#x2009;BPM and above 180&#x2009;BPM suggesting some noise in the signal, likely due to the equipment type<xref ref-type="fn" rid="fn2">
<sup>2</sup>
</xref>.
</p>
<p>The <sc>Ulm-TSST</sc> corpus also includes continuous emotion ratings, which were rated by three annotators for the dimensions of arousal and valence, at a 2&#xa0;Hz sampling rate. Arousal and valence are derived from Russell&#x2019;s circumplex for affect (<xref ref-type="bibr" rid="B59">Russell, 1980</xref>), and allow for dimensional interpretation of the strength (arousal) and positivity (valence) of an emotion. For these signals, a &#x201c;gold standard&#x201d; is obtained by the fusion of annotator ratings, utilising the <sc>RAAW</sc> method, implemented using the MuSe-Toolbox (<xref ref-type="bibr" rid="B72">Stappen et&#x20;al., 2021c</xref>). The mean Pearson correlation inter-rater agreement for these fused signals is 0.186 (&#xb1;0.230) for arousal, and 0.204 (&#xb1;0.200) for valence.</p>
</sec>
<sec id="s3-3">
<title>3.3 Data Processing</title>
<p>For the <sc>FAU-TSST</sc> and <sc>Ulm-TSST</sc> corpora, the audio data was extracted from the video camera, placed approximately 3&#xa0;m from the subject. For the <sc>Reg-TSST</sc> corpus, two channels of audio were captured, and for the experiments, we utilise the first channel, which was recorded using the AKG PW45 presenter set with a close-talk microphone. All audio was converted to 16&#xa0;kHz, 16&#xa0;bit, mono, WAV format, and peak normalisation to 1&#xa0;dB was applied to each audio file, i.&#x2009;e., adjusting the loudness based on the maximum amplitude of the signal, before extracting features. We re-ran the processing procedure for the <sc>FAU-TSST</sc> corpus that was first presented in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref> to include portions of non-speech, and match <sc>Ulm-TSST</sc> and <sc>Reg-TSST</sc>. For the audio of all corpora, we applied <italic>voice activity detection</italic> (VAD), utilising the LSTM-RNN approach described by <xref ref-type="bibr" rid="B26">Hagerer et&#x20;al. (2017)</xref>. This method utilises spectral and MFCC-based features to generate frame-level VAD decisions with a granularity of 20&#xa0;ms. The model was trained in a multitask setting to jointly predict speech overlap, gender, and speech probability, achieving an overall performance of 93% <italic>F</italic>
<sup>1</sup>-score for speech detection. From this procedure in, cf. <xref ref-type="table" rid="T1">Table&#x20;1</xref>, it can be seen that the arithmetic task contains less speech, and in general, there appears to be substantial silence within the audio data, likely caused by the induced stress.</p>
<p>For all corpora, we create segments from the continuous signal. For <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, this is based on speech start (provided by the VAD), until the next utterance. To be comparable to the MuSe challenge, we do not alter the segmentation applied to <sc>Ulm-TSST</sc>. As the text is also available for this corpus, the segmentation is based on aligned transcription (cf. <xref ref-type="bibr" rid="B70">Stappen et&#x20;al. (2021a)</xref> for further detail). Each corpus is then partitioned in a speaker-independent manner into training, development, and test sets, cf. <xref ref-type="table" rid="T1">Table&#x20;1</xref>, where demographics including age and gender are balanced as best possible.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Acoustic Analysis</title>
<p>To further analyse the manifestation of stress in the human voice and explore each of the corpora utilised in our experiments more deeply, we extract the low-level acoustic features over the entire speech sample prior to segmentation for each speaker. We extract the fundamental frequency (<italic>F</italic>
<sub>0</sub>) and volume intensity (dB), as these are aspects of speech known to vary during stress (<xref ref-type="bibr" rid="B56">Protopapas and Lieberman, 1997</xref>; <xref ref-type="bibr" rid="B21">Giddens et&#x20;al., 2013</xref>). For the <italic>F</italic>
<sub>0</sub> extraction, we remove zero-values in other words, non-voiced parts to not skew the result based on segments of silence in the audio files; however, we consider the silence for intensity.</p>
<p>We first explore the acoustic behaviour in relation to the raw cortisol samples (nmol/L) from the <sc>FAU-TSST</sc>, and <sc>Reg-TSST</sc> corpora, in groupings of 3-classes (lower 33rd, middle, and higher 66th percentile) at each sample time-point. It can be assumed that a higher feeling of stress leads to a higher cortisol response, although with some delay <xref ref-type="bibr" rid="B24">Goodman et&#x20;al. (2017)</xref>. Given the variance in cortisol responses, as seen by the reasonably large standard deviation at each time-point <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>, these coarse groupings allow us to observe the behaviour of subjects with higher cortisol response against those with lower response to understand how, if at all, acoustic features relate to high states of stress. As the cortisol targets for each corpus were extracted with a different assay (cf. <xref ref-type="sec" rid="s3">Section 3</xref>), we perform the grouping individually for each based on the percentile distribution. For <sc>FAU-TSST</sc> 33rd <inline-formula id="inf1">
<mml:math id="m1">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>4.90</mml:mn>
</mml:math>
</inline-formula> nmol/L, middle 4.90&#x2013;9.05&#xa0;nmol/L, 66th <inline-formula id="inf2">
<mml:math id="m2">
<mml:mo>&#x3e;</mml:mo>
<mml:mn>9.05</mml:mn>
</mml:math>
</inline-formula> nmol/L, and for <sc>Reg-TSST</sc> 33rd <inline-formula id="inf3">
<mml:math id="m3">
<mml:mo>&#x3c;</mml:mo>
<mml:mn>4.18</mml:mn>
</mml:math>
</inline-formula> nmol/L, middle 4.18&#x2013;6.79&#xa0;nmol/L, 66th <inline-formula id="inf4">
<mml:math id="m4">
<mml:mo>&#x3e;</mml:mo>
<mml:mn>6.79</mml:mn>
</mml:math>
</inline-formula> nmol/L.</p>
<p>It is clear from plotting the classes (cf. <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>) that each corpus behaves similarly at each sequential time step. In general, speakers tend to have a more powerful intensity for the 66th percentile cortisol groupings at S3-S4. In <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>, we see that at S1, those speakers in the lower 33rd percentile show a larger range in intensity, which reduces at S3 to S4. At S3 and S4, the mean intensity in dB also increases, particularly for those with a higher cortisol response; this intensity then decreases as the cortisol begins to lower at S5. In general, we can see from this analysis that those with higher levels of cortisol tend to have louder mean speech volume, and broader range in volume than other cortisol groupings, although this is in general less consistent for <sc>Reg-TSST</sc>, potentially due to the differing microphone or the smaller population in the higher groupings of this corpus.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Box plot representation of extracted intensity (dB) and pitch (<italic>F</italic>
<sub>0</sub>) for each speaker. Percentile grouping based on raw cortisol as nmol/L, showing measures taken at time-points S1, S3&#x2013;S5, for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora. <bold>(A)</bold> Intensity dB&#x2014;FAU-TSST S1, S3&#x2014;S5. <bold>(B)</bold> Intensity dB&#x2014;R<sc>EG</sc>-TSST S1, S3&#x2014;S5. <bold>(C)</bold> <italic>F</italic>
<sub>0</sub>&#x20;Hz&#x2014;FAU-TSST S1, S3&#x2014;S5. <bold>(D)</bold> <italic>F</italic>
<sub>0</sub>&#x20;Hz&#x2014;R<sc>EG</sc>-TSST S1, S3&#x2014;S5.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g003.tif"/>
</fig>
<p>Interestingly, for <italic>F</italic>
<sub>0</sub> (cf. <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>), we see similar behaviour concerning the cortisol groupings, particularly for the <sc>Reg-TSST</sc>. In this case, the standard deviation of <italic>F</italic>
<sub>0</sub> appears to increase with higher cortisol levels, and the same is true for <sc>FAU-TSST</sc> at S3-S4 although less prominent. As we split results by sex here (male and female) we see that the effect is not consistent for sex groupings, but it seems that at S3 both male and female groups do increase <italic>F</italic>
<sub>0</sub> variance as cortisol response becomes higher, a finding which is consistent with related literature which states the <italic>F</italic>
<sub>0</sub> mean increases as cortisol also increases <xref ref-type="bibr" rid="B54">Pisanski et&#x20;al. (2016)</xref>.</p>
<p>We also explore groupings of mean (<italic>&#x3bc;</italic>) heart rate as beats per minute (BPM), Low <italic>&#x3bc;</italic> &#x3c; 80 BPM, Middle <italic>&#x3bc;</italic> 80&#x2013;90 BPM, High <italic>&#x3bc;</italic>&#x20;&#x3e; 90 BPM. These groupings were selected to balance the subjects in each group based on the distribution of the signal across both sets. This time, we plot the results for each of the TSST tasks, separately and all together. As with cortisol, we do see a relationship between the physiological BPM groupings and the acoustic features, cf. <xref ref-type="fig" rid="F4">Figure&#x20;4</xref>. For the intensity of the <sc>Reg-TSST</sc> corpus, there is a clear decline in volume as BPM increases for both tasks. This trend is not as clear for <sc>Ulm-TSST</sc>, but the range does increase. When looking at <italic>F</italic>
<sub>0</sub> in the same grouping for BPM cf. <xref ref-type="fig" rid="F4">Figure&#x20;4</xref>, we see slightly more consistency, observing a slight increase in the range for <italic>F</italic>
<sub>0</sub> as <italic>&#x3bc;</italic> BPM increases. This finding is supported by other literature, which has shown that there is a relationship between heart rate and vocal quality (<xref ref-type="bibr" rid="B37">Kovalenko et&#x20;al., 2019</xref>), showing that BPM can be considered an indication of stress as it pertains to cortisol. Furthermore, as with the cortisol groupings we also split by sex for <italic>F</italic>
<sub>0</sub> analysis, and the mean <italic>F</italic>
<sub>0</sub> for both sex does appear to increase with higher heart rate, although this is less consistent for the <sc>Ulm-TSST</sc> corpus and also males as compared to females.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Box plot representation of extracted intensity (dB), and pitch (<italic>F</italic>
<sub>0</sub>) of each speaker. Grouped based on <italic>&#x3bc;</italic> Heart Rate as BPM. Including, <sc>Reg-TSST</sc> interview task, <sc>Reg-TSST</sc> arithmetic task, and <sc>Ulm-TSST</sc> interview task. <bold>(A)</bold> Intensity dB&#x2014;R<sc>EG</sc>-TSST, Interview and Arithmetic, U<sc>LM</sc>-TSST Interview. <bold>(B)</bold> <italic>F</italic>
<sub>0</sub>&#x20;Hz&#x2014;R<sc>EG</sc>-TSST, Interview and Arithmetic, U<sc>LM</sc>-TSST Interview.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g004.tif"/>
</fig>
</sec>
<sec id="s5">
<title>5 Experimental Settings</title>
<p>We conduct four core experiments to explore further the benefits of speech features in the context of recognising markers of stress. As physiological markers are known to strongly affect the HPA axis, which is a factor that alters during a stressful situation, we recognise 1) sequential saliva-based samples of cortisol, utilising the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora, where samples taken post-stress (S2&#x2b;) with a strong correlation to the features would indicate an effective approach, 2) continuous emotion, as arousal and valence with <sc>Ulm-TSST</sc> 3) continuous heartbeats per minute (BPM) utilising <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc>, and 4) continuous respiration, based on chest displacement, from <sc>Ulm-TSST</sc>. Within these paradigms, we perform several cross-corpus (where possible) and transfer learning experiments (results discussed in <xref ref-type="sec" rid="s6">Section 6</xref>) for each of these targets, exploring the efficacy of the machine learning approaches for entirely unlabelled&#x20;data.</p>
<sec id="s5-1">
<title>5.1 Features</title>
<p>We apply a feature-based machine learning approach, and we mainly focus on speech-driven audio features. However, we do include vision features to observe the potential benefit of fusion, and validate the advantage of speech features in this particular context.</p>
<p>
<bold>Acoustic:</bold> From previous studies, we found that hand-crafted features appear to perform more robustly for the task of sequential cortisol prediction (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>). However, as this was based on a single dataset, further validation was needed, and so for this study, we extract again both hand-crafted speech-based features, namely the <italic>Computational Paralinguistics challengE</italic> (<sc>ComParE</sc>) feature set, and the <italic>extended Geneva Minimalistic Acoustic Parameter Set</italic> (<sc>eGeMAPS</sc>), as well as the deep learning spectrogram based approach utilising the <sc>DeepSpectrum</sc> &#x2009; toolkit from the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>. From each audio instance, the <sc>ComParE</sc> &#x2009; and <sc>eGeMAPS</sc> &#x2009; and <sc>DeepSpectrum</sc> &#x2009; features are extracted at a rate of 1&#xa0;s, using an overlapping window of 0.5&#xa0;s. For the hand-crafted sets, we utilise the <sc>openSMILE</sc> &#x2009; toolkit to extract the 6&#x2009;373 dimensional <sc>ComParE</sc> feature set (<xref ref-type="bibr" rid="B17">Eyben et&#x20;al., 2013</xref>), and 88 dimensional <sc>eGeMAPS</sc> feature set (<xref ref-type="bibr" rid="B16">Eyben et&#x20;al., 2016</xref>). These features have been shown to be effective for a number of similar wellbeing related tasks (<xref ref-type="bibr" rid="B34">Kim et&#x20;al., 2019</xref>; <xref ref-type="bibr" rid="B66">Schuller et&#x20;al., 2020</xref>), including detection of early stage dementia (<xref ref-type="bibr" rid="B27">Haider et&#x20;al., 2019</xref>), and levels of anxiety (<xref ref-type="bibr" rid="B4">Baird et&#x20;al., 2020</xref>). For the <sc>DeepSpectrum</sc> &#x2009; features, we extract a 2,560 dimensional feature set of deep data-representations using the <sc>DeepSpectrum</sc> &#x2009; toolkit (<xref ref-type="bibr" rid="B2">Amiriparian et&#x20;al., 2017</xref>). 
<sc>DeepSpectrum</sc> &#x2009; has shown success for various audio- and speech-based tasks (<xref ref-type="bibr" rid="B44">Mertes et&#x20;al., 2020</xref>), and extracts features from the audio data using pre-trained convolutional neural networks. For this study, we extract features based on the viridis colour map, and the deep features are extracted from the layer <italic>fc7</italic> of AlexNet (<xref ref-type="bibr" rid="B38">Krizhevsky et&#x20;al., 2012</xref>). We also explore the use of <sc>VGGish</sc> &#x2009; functions <xref ref-type="bibr" rid="B30">Hershey et&#x20;al. (2017)</xref> which are pre-trained on AudioSet (<xref ref-type="bibr" rid="B20">Gemmeke et&#x20;al., 2017</xref>). From this, we extract a 128-dimensional <sc>VGGish</sc>&#x2009; embedding vector from the underlying log spectrograms.</p>
<p>
<bold>Visual:</bold> For the video-based features, we utilise the well-established <sc>VGGface</sc> &#x2009; set, and extract this from <sc>FAU-TSST</sc> and <sc>Ulm-TSST</sc> excluding <sc>Reg-TSST</sc> as no video data was available. The first step in this pipeline is to extract the faces as images, and to do this at the same frame-rate as the audio features (2&#xa0;Hz), utilising the <sc>MTCNN&#x2009;</sc> (<xref ref-type="bibr" rid="B82">Zhang et&#x20;al., 2016</xref>) which is pre-trained on the data sets WIDER FACE (<xref ref-type="bibr" rid="B79">Yang et&#x20;al., 2015</xref>) and CelebA (<xref ref-type="bibr" rid="B42">Liu et&#x20;al., 2015</xref>). We use the <sc>VGGface</sc> &#x2009; (version 1) (<xref ref-type="bibr" rid="B52">Parkhi et&#x20;al., 2015</xref>), which is based on the pre-trained deep CNN VGG-16, which was introduced by the Visual Geometry Group of Oxford (<xref ref-type="bibr" rid="B68">Simonyan and Zisserman, 2014</xref>). Detaching the top-layer of a pre-trained network results in a 512 feature vector output referred to as <sc>VGGface</sc>.</p>
</sec>
<sec id="s5-2">
<title>5.2 Regressors</title>
<p>For all the recognition tasks, we are performing regression experiments. To do this, we first validate the data itself by performing a series of arguably more robust Support Vector Regression (SVR) experiments for the cortisol targets only. This is then followed by a series of deep learning models based on an LSTM-RNN architecture to explore a more state-of-the-art approach, which may better observe the time-dependent nature of the observed signals.</p>
<p>
<bold>SVR:</bold> For the initial experiments we use the epsilon-support vector regression (SVR) and a linear kernel implementation from the Scikit-Learn toolkit (<xref ref-type="bibr" rid="B53">Pedregosa et&#x20;al., 2011</xref>). For training, the data is split into speaker-independent sets: During the development phase, we trained a series of SVR models, optimising the complexity parameters (<italic>C</italic>&#x20;&#x2208; 10<sup>&#x2013;4</sup>&#x2013;1), evaluating their performance on the development set. We re-trained the model with the concatenated train and development set and evaluated the test set performance.</p>
<p>
<bold>LSTM-RNN:</bold> We utilise a similar LSTM-RNN based architecture to the one which was applied for the baseline of the MuSe 2021 Challenge<xref ref-type="fn" rid="fn3">
<sup>3</sup>
</xref> and similar tasks (<xref ref-type="bibr" rid="B71">Stappen et&#x20;al., 2021b</xref>,<xref ref-type="bibr" rid="B72">c</xref>). In the training processes, the features and labels of every input are further segmented via a windowing approach (<xref ref-type="bibr" rid="B74">Sun et&#x20;al., 2020</xref>), which may offer the network more context. We experimented with various window lengths, but as in the MuSe Challenge, a window size of 300 steps (150&#xa0;s) was found to be optimal for all corpora. We tested <italic>n</italic>&#x20;&#x3d; (1, 2, 4)-layered uni and bidirectional networks with <italic>h</italic>&#x20;&#x3d; (50, 100, 200) hidden states and a learning rate of <italic>lr</italic> &#x3d; (0.00005, 0.0001, 0.005, 0.001). Initial experiments showed that the best results were obtained with a 4-layered network, consisting of two LSTM and two fully-connected (FC) layers, with a hidden size of 50, and a learning rate of 0.00005 (cf. <xref ref-type="fig" rid="F5">Figure&#x20;5</xref> for an overview). To reduce the computational overhead, we utilised these values in all experiments reported&#x20;here.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>LSTM-RNN model architecture. The input sequence {<italic>X</italic>
<sub>
<italic>i</italic>
</sub>, <italic>i</italic>&#x20;&#x2208; [1, <italic>T</italic>]} is first fed to two LSTM layers of hidden size 50. The intermediate representations <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2032;</mml:mo>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> produced by the second LSTM layer are then processed by two FC layers to produce the output sequence {<italic>O</italic>
<sub>
<italic>i</italic>
</sub>, <italic>i</italic>&#x20;&#x2208; [1, <italic>T</italic>]}.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g005.tif"/>
</fig>
<p>
<bold>Model evaluation:</bold> For some targets examined here, we have continuous frame-level labels available. This allows us to use the same formulation as in the MuSe Challenge, where we obtain frame-level predictions using an LSTM-RNN architecture and subsequently compare those to the frame-level target. This is not true for the cortisol task, as only one single target value is available per session. Moreover, each session lasts approximately 10&#xa0;min, and stress may only manifest on short, intermittent segments throughout those recordings. To overcome these challenges, we opted to replicate the session-level labels on the frame and model them accordingly. During training, we use standard many-to-many training (<xref ref-type="bibr" rid="B46">Mousa and Schuller, 2016</xref>), where the networks (SVR and LSTM) are trained to predict the target on all frames. This formulation results in frame-level predictions during evaluation as well. However, as mentioned, we only have a single session-level target. Thus, to evaluate the performance of our models, we first aggregate (i.&#x2009;e., average) their predictions for each session before comparing them to the reference cortisol values.</p>
<p>As primary evaluation metrics for all models, we report either <italic>Spearman</italic>&#x2019;<italic>s correlation coefficient</italic> (<italic>&#x3c1;</italic>), <italic>Root-Mean Square Error</italic> (RMSE) or normalised RMSE (NRMSE). Reporting correlation as <italic>&#x3c1;</italic> is used for the sequential cortisol target, as we are interested in exploring trends in the data and how well the models can learn targets that are derived from a more ordinal value. When discussing specific results for <italic>&#x3c1;</italic>, the <italic>p-value</italic> is also reported, to additionally discuss the significance of the correlation. In this case, as with any other <italic>p</italic>-values reported, significance can be considered at values of <italic>p</italic>&#x20;&#x3c; 0.05. RMSE, in contrast, is better suited to a more objective evaluation, which fits the case of time-continuous signals such as heart rate, and given the less intuitive range of the respiration signal, we report NRMSE in this&#x20;case.</p>
</sec>
</sec>
<sec sec-type="results|discussion" id="s6">
<title>6 Results and Discussion</title>
<p>We provide a series of tables and plots to report various aspects of the results obtained by our experiments. For clarity of presentation, we will discuss the results obtained for each of the targets separately.</p>
<sec id="s6-1">
<title>6.1 Sequential Cortisol Prediction</title>
<p>Our main source of truth for the degree of stress during the TSST setting is the saliva-based cortisol measurements obtained at differing time points. This information is only available for the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> datasets; therefore, we focus primarily on those two in this section. As discussed in <xref ref-type="sec" rid="s3">Section 3</xref>, the only modality standard across those two datasets is audio, while for <sc>FAU-TSST</sc>, we additionally have video. For this reason, we primarily focus on the audio modality, which in <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref> was shown to be a strong predictor of cortisol-based stress.</p>
<p>Furthermore, as noted in <xref ref-type="sec" rid="s3">Section 3</xref>, cortisol values were derived using different assays, thus making the two scales incompatible. This makes it incorrect to evaluate any trained models with a standard cross-corpus paradigm, whereby models are trained on one dataset and evaluated on another. Instead, the core focus of our experiments is to explore how well the methodology can be replicated on different datasets. Nevertheless, we additionally explore the direction of pooling the data from the two studies and learning a joint model. Pooling more data, which come from fundamentally different domains, e.&#x2009;g., acoustically and the cortisol assay used, might still benefit the training of neural networks, which typically require a lot of data to learn from. We thus train models in both single- and multi-domain settings, and always evaluate them on in-domain data separately for each dataset.</p>
<p>As discussed in <xref ref-type="sec" rid="s3">Section 3</xref>, the subjects performed two tasks during the TSST: a speech interview and an arithmetic task. We hypothesise that subjects behaved differently during each task, and that stress manifested differently in the respective acoustic features. This hypothesis was validated in the initial experiments of <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, where models built on each task separately perform better than models built with both tasks. Thus, for these experiments we additionally differentiate between the interview and the arithmetic tasks, building separate models for each of them, and contrasting their performance to models built after pooling both&#x20;tasks.</p>
<p>We first run a series of experiments with a traditional SVR algorithm and only acoustic features to explore if the <sc>Reg-TSST</sc> dataset performs similarly to <sc>FAU-TSST</sc>, and if the study from <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref> can be replicated for <sc>FAU-TSST</sc> with a slightly adapted methodology (e.&#x2009;g., altered speech segmentation) for data processing. In <xref ref-type="fig" rid="F6">Figure&#x20;6</xref>, we see that the <sc>FAU-TSST</sc> corpus behaves as expected, with correlation strongest after S4 (interview: S3, <sc>FAU-TSST</sc> <sc>eGeMAPS</sc> &#x2009; 0.200, <italic>p</italic>&#x20;&#x3c; 0.05; S4, <sc>FAU-TSST</sc> <sc>eGeMAPS</sc> &#x2009; 0.340&#x20;<italic>p</italic>&#x20;&#x3c; 0.05), slightly weaker for the arithmetic task compared to the interview, which could be caused by the reduced speech in the arithmetic task. For the <sc>Reg-TSST</sc> corpus, the trend is less obvious for all feature sets, particularly for the interview task with <sc>ComParE</sc> &#x2009; features where we see a strong decline from S1. The <sc>eGeMAPS</sc> &#x2009; features appear to perform consistently for both tasks of the <sc>Reg-TSST</sc>, however, in this case the arithmetic task appears to have stronger correlations than the interview, peaking earlier at S3 than <sc>FAU-TSST</sc> for this task, which may indicate the above-mentioned difference in intra-individual stress response during the speech tasks of the two corpora. In general, from these experiments, we not only initially affirm the findings of <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref> that higher correlation is obtained post S2 (in general either S3 or S4) by validating this on an additional corpus, but we also affirm that hand-crafted features are more suited for this task. However, for the novel <sc>Reg-TSST</sc> data, the smaller <sc>eGeMAPS</sc> &#x2009; set is performing more robustly, and more consistently overall. 
Given this, we will continue to use <sc>eGeMAPS</sc>&#x2009; as the main acoustic feature set for further experiments.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>SVR results for the <sc>FAU-TSST</sc> <bold>(above)</bold>, and <sc>Reg-TSST</sc> <bold>(below)</bold>. Reporting Spearman&#x2019;s correlation coefficient (<italic>&#x3c1;</italic>) for all scenarios <bold>(left)</bold>, interview task <bold>(middle)</bold>, and arithmetic task <bold>(right)</bold>, for each sequential saliva (S) sample. SVR experiments were conducted with three different acoustic feature sets: <sc>eGeMAPS</sc>, <sc>DeepSpectrum</sc>, and <sc>ComParE</sc>.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g006.tif"/>
</fig>
<p>Results for the LSTM model are shown in <xref ref-type="table" rid="T2">Table&#x20;2</xref>. Again we see that, in line with <xref ref-type="bibr" rid="B3">Baird et&#x20;al. (2019)</xref>, speech-based models can predict cortisol levels from samples taken at time points S2-S5 with a medium to strong correlation and a mean peak around S4 (&#x2b;20&#xa0;min after the TSST). This is consistent across both datasets and tasks. However, there are important and interesting differences across different settings.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Spearman&#x2019;s correlation coefficient (<italic>&#x3c1;</italic>) for session-based cortisol at each saliva (S)ample, from S0 &#x2212;45&#xa0;min to S7 &#x2b;60&#xa0;mins. Utilising <sc>eGeMAPS</sc> &#x2009;features for the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora, for the (Inter)view and (Arith)metic tasks, as well as the mean (<italic>&#x3bc;</italic>.) across all. Emphasised results indicate a positive correlation above 0.2.</p>
</caption>
<table>
<thead>
<tr>
<td align="left">
<italic>&#x3c1;</italic>
</td>
<td align="center">&#x2014;</td>
<td colspan="8" align="center">
<sc>FAU-TSST</sc>
</td>
</tr>
<tr>
<td align="left">Train</td>
<td align="center">Task</td>
<td align="center">S0</td>
<td align="center">S1</td>
<td align="center">S2</td>
<td align="center">S3</td>
<td align="center">S4</td>
<td align="center">S5</td>
<td align="center">S6</td>
<td align="center">S7</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<sc>FAU</sc>
</td>
<td align="left">Inter</td>
<td align="center">0.104</td>
<td align="center">0.016</td>
<td align="center">
<bold>0.203</bold>
</td>
<td align="center">0.000</td>
<td align="center">
<bold>0.286</bold>
</td>
<td align="center">-0.209</td>
<td align="center">-0.352</td>
<td align="center">-0.324</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc>
</td>
<td align="left">Arith</td>
<td align="center">
<bold>0.302</bold>
</td>
<td align="center">0.060</td>
<td align="center">
<bold>0</bold>.<bold>236</bold>
</td>
<td align="center">
<bold>0.385</bold>
</td>
<td align="center">
<bold>0</bold>.<bold>396</bold>
</td>
<td align="center">-0.165</td>
<td align="center">-0.242</td>
<td align="center">-0.225</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc>
</td>
<td align="left">Inter. and Arith</td>
<td align="center">0.077</td>
<td align="center">0.093</td>
<td align="center">0.022</td>
<td align="center">0.099</td>
<td align="center">-0.176</td>
<td align="center">-0.286</td>
<td align="center">-0.555</td>
<td align="center">-0.407</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Inter</td>
<td align="center">0.154</td>
<td align="center">0.055</td>
<td align="center">-0.159</td>
<td align="center">0.159</td>
<td align="center">0.044</td>
<td align="center">-0.341</td>
<td align="center">0.016</td>
<td align="center">-0.456</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Arith</td>
<td align="center">
<bold>0.335</bold>
</td>
<td align="center">
<bold>0.214</bold>
</td>
<td align="center">
<bold>0.368</bold>
</td>
<td align="center">
<bold>0.374</bold>
</td>
<td align="center">
<bold>0.698</bold>
</td>
<td align="center">
<bold>0.286</bold>
</td>
<td align="center">-0.027</td>
<td align="center">-0.214</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Inter. and Arith</td>
<td align="center">0.126</td>
<td align="center">
<bold>0.209</bold>
</td>
<td align="center">-0.077</td>
<td align="center">0.104</td>
<td align="center">0.088</td>
<td align="center">-0.220</td>
<td align="center">-0.632</td>
<td align="center">-0.456</td>
</tr>
<tr>
<td align="left">
<italic>&#x3bc;</italic>
</td>
<td align="left">&#x2014;</td>
<td align="center">0.183</td>
<td align="center">0.108</td>
<td align="center">0.099</td>
<td align="center">0.187</td>
<td align="center">
<bold>0.223</bold>
</td>
<td align="center">-0.156</td>
<td align="center">-0.299</td>
<td align="center">-0.347</td>
</tr>
<tr>
<td align="left"/>
<td align="center">
<bold>&#x2014;</bold>
</td>
<td colspan="8" align="center">
<sc>
<bold>Reg-TSST</bold>
</sc>
</td>
</tr>
<tr>
<td align="left">
<sc>REG</sc>
</td>
<td align="left">Inter</td>
<td align="center">
<bold>0.297</bold>
</td>
<td align="center">
<bold>0.827</bold>
</td>
<td align="center">
<bold>0.527</bold>
</td>
<td align="center">
<bold>0.261</bold>
</td>
<td align="center">
<bold>0.236</bold>
</td>
<td align="center">-0.127</td>
<td align="center">-0.527</td>
<td align="center">-0.079</td>
</tr>
<tr>
<td align="left">
<sc>REG</sc>
</td>
<td align="left">Arith</td>
<td align="center">0.091</td>
<td align="center">
<bold>0.559</bold>
</td>
<td align="center">-0.164</td>
<td align="center">0.091</td>
<td align="center">
<bold>0.455</bold>
</td>
<td align="center">
<bold>0.333</bold>
</td>
<td align="center">
<bold>0.552</bold>
</td>
<td align="center">
<bold>0.406</bold>
</td>
</tr>
<tr>
<td align="left">
<sc>REG</sc>
</td>
<td align="left">Inter. and Arith</td>
<td align="center">0.127</td>
<td align="center">
<bold>0.474</bold>
</td>
<td align="center">0.055</td>
<td align="center">
<bold>0.285</bold>
</td>
<td align="center">
<bold>0.248</bold>
</td>
<td align="center">0.115</td>
<td align="center">-0.273</td>
<td align="center">-0.406</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Inter</td>
<td align="center">-0.152</td>
<td align="center">
<bold>0.559</bold>
</td>
<td align="center">
<bold>0.467</bold>
</td>
<td align="center">
<bold>0.200</bold>
</td>
<td align="center">
<bold>0.261</bold>
</td>
<td align="center">-0.018</td>
<td align="center">-0.539</td>
<td align="center">0.164</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Arith</td>
<td align="center">-0.212</td>
<td align="center">
<bold>0.267</bold>
</td>
<td align="center">0.055</td>
<td align="center">-0.042</td>
<td align="center">
<bold>0.370</bold>
</td>
<td align="center">
<bold>0.212</bold>
</td>
<td align="center">0.188</td>
<td align="center">0.091</td>
</tr>
<tr>
<td align="left">
<sc>FAU</sc> and <sc>Reg</sc>
</td>
<td align="left">Inter. and Arith</td>
<td align="center">0.006</td>
<td align="center">
<bold>0.584</bold>
</td>
<td align="center">
<bold>0.721</bold>
</td>
<td align="center">
<bold>0.770</bold>
</td>
<td align="center">
<bold>0.442</bold>
</td>
<td align="center">0.176</td>
<td align="center">-0.139</td>
<td align="center">-0.042</td>
</tr>
<tr>
<td align="left">
<italic>&#x3bc;</italic>
</td>
<td align="left">&#x2014;</td>
<td align="center">0.026</td>
<td align="center">
<bold>0.545</bold>
</td>
<td align="center">
<bold>0.279</bold>
</td>
<td align="center">
<bold>0.261</bold>
</td>
<td align="center">
<bold>0.335</bold>
</td>
<td align="center">0.115</td>
<td align="center">-0.123</td>
<td align="center">0.022</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In general, we observe that with the LSTM network, we can better predict cortisol from the arithmetic task of <sc>FAU-TSST</sc>, which slightly contradicts our SVR results and shows that this task can also yield good results if we consider the sequential nature of different frames. This indicates that, for this dataset, subjects either became more stressed during this part of the TSST or that the manifestation of stress in the speech was more pronounced. Based on our manual inspection of the dataset, the second hypothesis seems more plausible, as subjects who struggled during the interview tend to stay completely silent, whereas they would continuously produce utterances (although at short bursts) for the arithmetic task. Moreover, pooling data from both tasks resulted in worse performance when training on individual datasets, pointing towards a different expression of stress in each of&#x20;them.</p>
<p>Overall, for both tasks, we observe higher correlations for times S3-S4, with the interview task tending to peak a bit earlier than the arithmetic one. Given the relative delay between the two tasks, this is in line with our previous research (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>) showing that speech signals are more correlated with cortisol measurements taken approximately 10&#xa0;min after initial stress. Interestingly, we also observe a high correlation for cortisol measures taken at S1 (1&#xa0;min <italic>before</italic> the TSST) for <sc>Reg-TSST</sc> (particularly for the interview task). When observing the mean score for <sc>Reg-TSST</sc>, we see that S1 shows the highest peak. On the one hand, it could be considered that this is attributed to increased apprehension by the subjects, leading to more stressed behaviour during the early stage of the TSST; however, as we observed earlier there is lower variability across subjects for measurements at S1 (cf. <xref ref-type="sec" rid="s3">section 3</xref>) which may have made this task easier to&#x20;learn.</p>
<p>Finally, we observe that multi-domain models built by pooling both datasets perform consistently better, while additionally benefiting from the pooling of the interview and arithmetic tasks in the case of <sc>Reg-TSST</sc>. This illustrates that, even though the cortisol measurements in the two datasets are based on fundamentally different scales, the relationship between relative cortisol values and acoustic features remains consistent, allowing the models to benefit from bigger and more diverse data and obtain better performance, as measured by Spearman&#x2019;s correlation.</p>
<p>Even though our quantitative evaluation is performed on the session level, it is interesting to investigate how stress manifests through the audio modality at different time points using our approach. <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> shows frame-wise predictions vs a selection of sequential cortisol values for two subjects, one from each corpus. For subject F-1 from <sc>FAU-TSST</sc> (top), we see a higher deviation from the cortisol ground truth, which settles more during segments of speech, as S3-S4 may be considered the true cortisol release at that time point. It is interesting that for subject R-1 for the <sc>Reg-TSST</sc> in <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> (below) the prediction is more consistent. For the S2&#x20;time point, recognition is more accurate earlier in the speech session, i.&#x2009;e., the interview task. Counter to this, at S5 for the <sc>FAU-TSST</sc> plot, we see that the system struggles to recognise after the interview task, which would indicate that this sample of cortisol is less of an indication of the stress response, affirming that the speech signal is a strong predictor for peaks in cortisol which occur due to stress. These individual differences across subjects suggest that speaker-adapted models, which have been shown to improve results for other affective computing tasks (<xref ref-type="bibr" rid="B76">Triantafyllopoulos et&#x20;al., 2021</xref>), could improve the predictive accuracy of stress prediction models as well. We also see, when observing <xref ref-type="fig" rid="F7">Figure&#x20;7</xref>, that the standard deviation is in general smaller for the <sc>Reg-TSST</sc> corpus, possibly indicating the benefit of the close-talk recording method.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Normalised frame-wise cortisol predictions (continuous) vs ground truth (dashed) sampled at times S2-S5 using <sc>eGeMAPS</sc> features for subjects F-1 from <sc>FAU-TSST</sc> and subject R-1 from <sc>Reg-TSST</sc>. The shaded (green) area represents segments detected as having speech by the VAD. Predictions are smoothed with a moving average filter with a window size of 30 for visibility. <bold>(A)</bold> F-1 S2, <bold>(B)</bold> F-1 S3, <bold>(C)</bold> F-1 S4, <bold>(D)</bold> F-1 S5, <bold>(E)</bold> R-1 S2, <bold>(F)</bold> R-1 S3, <bold>(G)</bold> R-1 S4, <bold>(H)</bold> R-1 S5.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g007.tif"/>
</fig>
<p>In addition, to compare the performance of audio, we investigate the effectiveness of video-based models for stress recognition on the <sc>FAU-TSST</sc> dataset, on which the video modality is available. We use an identical experimental protocol, simply substituting <sc>eGeMAPS</sc> &#x2009; with <sc>VGGface</sc> &#x2009; features. Results are shown in <xref ref-type="fig" rid="F8">Figure&#x20;8</xref>, and as can be seen, the correlations obtained with the vision features are much lower than those obtained with <sc>eGeMAPS</sc> &#x2009; features. This indicates that in the <sc>FAU-TSST</sc> dataset, the auditory modality is more appropriate for modelling stress, although there is still a similar behaviour where we see a peak in correlation after the point of stress (S2-S4). Moreover, we experiment with early and late multimodal fusion, where we either fuse (concatenate) the features and subsequently train a new model or fuse (average) the predictions of the existing unimodal models. As our acoustic experiments showed that task-specific models perform better, we did not fuse data from both TSST tasks for these experiments. We observe that multimodal fusion can lead to better performance in some cases, most notably for the prediction of cortisol at S2, suggesting that the interview task was more meaningful for these features, however, generally <sc>eGeMAPS</sc>&#x2009; features remain strong as a uni-modal approach.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Spearman&#x2019;s correlation coefficient for session-level cortisol prediction using <sc>VGGface</sc> features on the <sc>FAU-TSST</sc> dataset. We report unimodal visual-based results as well as multimodal ones utilising early and late fusion with <sc>eGeMAPS</sc> features (A &#x2b; V). We also replicate the <sc>eGeMAPS</sc> results from <xref ref-type="table" rid="T2">Table&#x20;2</xref> for easier comparison.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g008.tif"/>
</fig>
<p>Finally, we use the models built on <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> to predict the likely cortisol levels on the <sc>Ulm-TSST</sc> corpus, for which this information is not available. Although we do not have a ground truth here, we aim to see if the performance is similar concerning peaking cortisol levels after the S2&#x20;time point. To perform these experiments, we use the models built separately on <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>. These models were built on different scales, stemming from the fact that different assays were used to extract cortisol levels in the two datasets. We furthermore used the models built on data from the interview task alone, as this is the only task available for <sc>Ulm-TSST</sc>. <xref ref-type="fig" rid="F9">Figure&#x20;9</xref> shows the mean predicted cortisol levels from an entire <sc>Ulm-TSST</sc> session; similar to <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, we observe a peak in (predicted) cortisol levels at times S3 and S4. The <sc>FAU-TSST</sc> model is returning higher cortisol values; this is consistent with the dataset overview presented in <xref ref-type="sec" rid="s3">Section 3</xref> which shows that cortisol levels are higher for <sc>FAU-TSST</sc>.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>The mean (<italic>&#x3bc;</italic>) predicted cortisol value for each time step on the <sc>Ulm-TSST</sc> dataset using models built on <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> (above). Below are the BPM, arousal, and valence predictions grouped based on raw cortisol (nmol/L), obtained for time point S4 using models built on <sc>FAU-TSST</sc> <bold>(left-green)</bold> and <sc>Reg-TSST</sc> <bold>(right-orange)</bold>. Low, mid, and high groupings defined separately for each dataset based on percentiles. <bold>(A)</bold> U<sc>LM</sc>-TSST <italic>&#x3bc;</italic> cortisol in nmol/L. <bold>(B)</bold> Cortisol groups for BPM, arousal, and valence predictions of FAU-TSST <bold>(left-green)</bold> and R<sc>EG</sc>-TSST.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g009.tif"/>
</fig>
<p>In addition, box plots of grouped cortisol levels, and the other biomarkers available to the <sc>Ulm-TSST</sc> corpus show that higher (predicted) cortisol levels correspond to slightly higher BPM and arousal, and slightly lower (negative) valence. Moreover, we observe some noticeable differences between the predictions obtained by the two models. For example, the model built on <sc>Reg-TSST</sc> data shows its lowest cortisol predictions for very narrow beats per minute (BPM), arousal and valence ranges, which is less narrow for <sc>FAU-TSST</sc> at those targets, and for valence the lower percentile shows a broader range than all other groupings. These differences further demonstrate that models trained on different corpora, with differences in the acoustic conditions and the way cortisol levels were measured, can result in models that behave in different ways on out-of-domain data. However, in general, behaviours appear to be consistent.</p>
<p>In summary, our results demonstrate that it is possible to predict cortisol levels taken 10&#x2013;20&#xa0;min (common time frame for the post-stress cortisol peak (<xref ref-type="bibr" rid="B24">Goodman et&#x20;al., 2017</xref>)) after a stressful event using speech as well as video features, with the former performing better in this context. Stratifying the data concerning the task that the subjects were performing additionally reveals an interesting trend; we see a general trend that we are able to better predict cortisol levels from the arithmetic task of <sc>FAU-TSST</sc>, but from the interview task of <sc>Reg-TSST</sc>. This may point to underlying differences in the way subjects experienced and expressed stress in the two data collection procedures; there is overall far less speech data in the <sc>Reg-TSST</sc> arithmetic task, which may be another reason for&#x20;this.</p>
<p>As mentioned, cortisol levels constitute our primary source of truth for an individual&#x2019;s stress level. However, these measurements are not easily collected and readily available, e.&#x2009;g., for the <sc>Ulm-TSST</sc> corpus they are missing, and learning from a single value from each session, is a challenge for any machine learning architecture. With this in mind, in the following sections, we further investigate continuous physiological markers of stress which are more readily available and offer a more fine-grained view of stress responses, particularly if combined with a cortisol ground&#x20;truth.</p>
</sec>
<sec id="s6-2">
<title>6.2 Emotional Dimensions</title>
<p>We begin our discussion of alternative markers for stress with the emotional dimensions of arousal and valence (<xref ref-type="bibr" rid="B59">Russell, 1980</xref>). These dimensions are known to be related to stress (<xref ref-type="bibr" rid="B33">Johnson and Anderson, 1990</xref>). The <sc>Ulm-TSST</sc> dataset is the only one of the three datasets examined here, which contains annotations for arousal and valence. These dimensions form the targets for the 2021&#x20;<sc>MuSe-Stress&#x2009;</sc>sub-challenge (<xref ref-type="bibr" rid="B70">Stappen et&#x20;al., 2021a</xref>). As there are no available annotations for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, we proceed to predict emotional values on the interview task for both, using models built on the <sc>Ulm-TSST</sc> dataset. As audio is our core focus and is the only modality commonly shared across all three datasets, we use the <sc>eGeMAPS</sc>&#x2009;-based models developed and released as part of the challenge baseline<xref ref-type="fn" rid="fn4">
<sup>4</sup>
</xref>. Both emotion models show strong performance on the <sc>Ulm-TSST</sc> test set, with the arousal model achieving a CCC of 0.4415, and the valence model one of 0.5019. Moreover, as the <sc>Ulm-TSST</sc> corpus only contains the interview task, we only predict those dimensions at the respective functions for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>.</p>
<p>In <xref ref-type="fig" rid="F10">Figure&#x20;10</xref>, we show distribution plots of the arousal and valence predictions for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> vs the cortisol measures taken at different time-points. The cortisol values have been grouped in the same way as <xref ref-type="sec" rid="s4">Section 4</xref>, i.&#x2009;e., into low, medium, and high groups based on the 33 and 66% percentiles derived from the raw cortisol values for the different datasets. As previously, we do observe different trends across the two datasets. <sc>FAU-TSST</sc> is generally showing positive values for arousal, whereas <sc>Reg-TSST</sc> is showing negative ones. Although these results are based on model predictions and are thus not as reliable as human annotations, they nevertheless shed light on potential differences across the two datasets. Interestingly, subjects in <sc>Reg-TSST</sc> appear generally less aroused compared to those in <sc>FAU-TSST</sc>, which once again points to underlying differences in how subjects reacted during the TSST in the two settings. For the high percentile grouping at S2 (&#x2b;1&#xa0;min after the TSST), we generally observe higher arousal values for both datasets, whereas we observe that lower arousal values are predicted for subjects in the lower cortisol percentile for <sc>FAU-TSST</sc>, as measured at S3 (&#x2b;10&#xa0;min after the TSST).</p>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Box plots of predicted arousal valence values for the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>. Grouped based on raw cortisol (nmol/L) measures taken at time points S2-S5 <bold>(left to right)</bold>. <bold>(A)</bold> Arousal with cortisol groups at S2&#x2013;S5 for FAU-TSST. <bold>(B)</bold> Arousal with cortisol groups at S2&#x2013;S5 for R<sc>EG</sc>-TSST. <bold>(C)</bold> Valence with cortisol groups at S2&#x2013;S5 for FAU-TSST. <bold>(D)</bold> Valence with cortisol groups at S2&#x2013;S5 for R<sc>EG</sc>-TSST.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g010.tif"/>
</fig>
<p>We additionally used a two-sample independent <italic>t</italic>-test to test the differences in predicted arousal and valence values for all groups and datasets. Of note, we did not conduct a normality test, as the <italic>t</italic>-test is known to be robust to deviations from normality for larger sample sizes <xref ref-type="bibr" rid="B63">Sawilowsky and Blair (1992)</xref>. All differences were found to be statistically significant at the <italic>p</italic>&#x20;&#x3c; 0.05 level, except arousal in lower vs middle percentile-cortisol percentiles measured at times S0 and S2 for <sc>FAU-TSST</sc> and S5 for <sc>Reg-TSST</sc>, mid vs high percentiles measured at S4 for <sc>Reg-TSST</sc>, and low vs high percentiles measured at S3 for <sc>FAU-TSST</sc>. For valence, the only non-significant results were those between the lower vs middle percentiles measured at S5 for both <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, and the middle vs high percentiles measured at S4 and S6 for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, respectively. This shows that, even though we lack ground truth values for <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, we could use a model trained on a related but different dataset to predict them and obtain strong predictors of stress.</p>
</sec>
<sec id="s6-3">
<title>6.3 Continuous Heart Rate</title>
<p>Stress is known to impact heart rate (HR) (<xref ref-type="bibr" rid="B7">Berntson and Cacioppo, 2004</xref>; <xref ref-type="bibr" rid="B75">Taelman et&#x20;al., 2009</xref>) through its activation of the sympathetic (<xref ref-type="bibr" rid="B22">Goldstein, 1987</xref>) and suppression of the parasympathetic branch of the autonomic nervous system (<xref ref-type="bibr" rid="B1">Akselrod et&#x20;al., 1981</xref>). HR can therefore serve as a vital indicator of stress in modern affective computing applications. As discussed in <xref ref-type="sec" rid="s3">Section 3</xref>, however, only one of the three datasets examined here, the <sc>Reg-TSST</sc> dataset, has both HR and cortisol measurements, whereas the <sc>FAU-TSST</sc> dataset has only cortisol measures and <sc>Ulm-TSST</sc> only HR ones. Thus, the only dataset where we can precisely evaluate the relationship of HR with stress is <sc>Reg-TSST</sc>.</p>
<p>
<xref ref-type="fig" rid="F11">Figure&#x20;11</xref> shows the distribution of ground truth HR values for the <sc>Reg-TSST</sc> dataset vs low, mid, and high cortisol levels taken at different time points. Two-sample independent t-tests show that all results are significant at the <italic>p</italic>&#x20;&#x3c; 0.05 level, except the low vs high percentiles at time S0 and the low vs middle percentiles at time S5. Overall, we observe a rising trend for BPM values as the cortisol levels increase; this is consistent with our expectations and prior work (<xref ref-type="bibr" rid="B7">Berntson and Cacioppo, 2004</xref>; <xref ref-type="bibr" rid="B75">Taelman et&#x20;al., 2009</xref>). This trend is particularly pronounced for S5 (&#x2b;20&#xa0;min after the TSST) showing that higher cortisol values obtained during that time were highly correlated with high BPMs during the TSST. In general, this trend differs from what was observed for the acoustic signals (cf. <xref ref-type="sec" rid="s4">Section 4</xref>), indicating that different modalities may be better at predicting cortisol levels measured at different&#x20;times.</p>
<fig id="F11" position="float">
<label>FIGURE 11</label>
<caption>
<p>Box plots of BPM value. Showing the ground truth <sc>Reg-TSST</sc> and predicted for <sc>FAU-TSST</sc>. Grouped based on raw cortisol (nmol/L) measures taken at time-points S2-S5. <bold>(A)</bold> Predicted BPM with cortisol groups at S2&#x2013;S5 for FAU-TSST. <bold>(B)</bold> Ground truth, BPM with cortisol groups at S2&#x2013;S5 for R<sc>EG</sc>-TSST.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g011.tif"/>
</fig>
<p>As the other dataset used in this study with cortisol measurements, <sc>FAU-TSST</sc>, does not have available HR measures, we attempt to predict BPMs using models built on the other two datasets. Specifically, we use the speech modality of the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc> datasets to build a model, which we then use to predict BPMs on the <sc>FAU-TSST</sc> dataset. This is motivated by audio being the only common modality across the three corpora, and also that the effect of HR on the voice has long been established by previous research (<xref ref-type="bibr" rid="B49">Orlikoff and Baken, 1989</xref>). Several prior works have attempted to model HR from voice signals, either as a classification (<xref ref-type="bibr" rid="B65">Schuller et&#x20;al., 2013</xref>) or a regression task (<xref ref-type="bibr" rid="B69">Smith et&#x20;al., 2017</xref>; <xref ref-type="bibr" rid="B32">Jati et&#x20;al., 2018</xref>). <xref ref-type="bibr" rid="B32">Jati et&#x20;al. (2018)</xref> use <sc>eGeMAPS</sc>&#x2009; to predict BPM from speech on the segment level, and achieve a root mean squared error (RMSE) of 12&#x20;BPM.</p>
<p>Inspired by these past findings, we attempt to predict HR in the form of BPMs using speech signals. In line with our previous results for cortisol, we use a long short-term memory (LSTM) architecture on <sc>eGeMAPS</sc> &#x2009; features. As all three datasets were recorded in different locations with potentially different acoustic conditions, we are faced with the well-understood domain mismatch problem (<xref ref-type="bibr" rid="B5">Ben-David et&#x20;al., 2010</xref>), where models trained on data from one domain might not generalise well to different domains. Moreover, as discussed in <xref ref-type="sec" rid="s3">Section 3</xref>, the two datasets cover non-overlapping ranges of the BPM range, with subjects in <sc>Reg-TSST</sc> having a generally lower BPM than subjects in <sc>Ulm-TSST</sc>, and are also recorded in different conditions, with <sc>Ulm-TSST</sc> consisting of far-field and <sc>Reg-TSST</sc> of near-field recordings. To address this issue, we first train two single-domain models using both available datasets in isolation and then train a multi-domain model using data from both datasets. In all cases, we evaluate and report model performance separately for each dataset.</p>
<p>RMSE results are shown in <xref ref-type="table" rid="T3">Table&#x20;3</xref>. Our initial observation is that all models perform better on the <sc>Ulm-TSST</sc> dataset, and that in-domain models perform better than their cross-domain counterparts. Moreover, the multi-domain model does not bring any improvements compared to the single-domain ones. This is partially explained by the limited overlap in the BPM range for the two datasets; combining the data does not lead to considerable benefits since the target is different. The best performing combination is obtained when training and testing on the <sc>Ulm-TSST</sc> dataset, and achieves an RMSE of 19 BPMs. This is higher than the results reported by previous works (<xref ref-type="bibr" rid="B32">Jati et&#x20;al., 2018</xref>), which were, however, performed on different data and are thus not directly comparable to ours. Moreover, as discussed above, potential movements of the subjects lead to more unreliable measurements, which make the target more of a challenge to&#x20;learn.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>RMSE as BPM for single- and multi-domain results for BPM prediction on the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc> corpora using <sc>eGeMAPS</sc> and the LSTM-based architecture.</p>
</caption>
<table>
<thead>
<tr>
<td align="left">
<italic>&#x3c1;</italic>
</td>
<td colspan="2" align="center">
<sc>
<bold>Reg-TSST</bold>
</sc>
</td>
<td colspan="2" align="center">
<sc>
<bold>Ulm-TSST</bold>
</sc>
</td>
</tr>
<tr>
<td align="left">Train</td>
<td align="center">Dev</td>
<td align="center">Test</td>
<td align="center">Dev</td>
<td align="center">Test</td>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">
<sc>Reg-TSST</sc>
</td>
<td align="center">
<bold>39.90</bold>
</td>
<td align="center">
<bold>38.57</bold>
</td>
<td align="center">20.98</td>
<td align="center">22.96</td>
</tr>
<tr>
<td align="left">
<sc>Ulm-TSST</sc>
</td>
<td align="center">36.53</td>
<td align="center">40.80</td>
<td align="center">
<bold>19.32</bold>
</td>
<td align="center">
<bold>22.70</bold>
</td>
</tr>
<tr>
<td align="left">
<sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc>
</td>
<td align="center">36.23</td>
<td align="center">38.96</td>
<td align="center">23.07</td>
<td align="center">23.05</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Emphasised results indicate strongest performance on given evaluation set.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In general, predicting HR from free speech signals is a challenging task and is especially hampered by the lack of information whenever subjects remained silent. This is illustrated in <xref ref-type="fig" rid="F12">Figure&#x20;12</xref>, where we present frame-wise BPM predictions vs ground truth signals for three subjects coming from the <sc>Reg-TSST</sc> and <sc>Ulm-TSST</sc> datasets. As seen in particular for subject U-64 (right), there may be periods of prolonged silence, where the audio modality is unavoidably a bad predictor of HR. Interestingly, even though we found silence periods occurring whenever subjects struggled with finding something to say during the interview task, we do not necessarily see an accompanying rise in BPMs, as seen for subject U-64.</p>
<fig id="F12" position="float">
<label>FIGURE 12</label>
<caption>
<p>Frame-wise BPM predictions vs ground truth for subject R-1 from the <sc>Reg-TSST</sc> dataset and subjects U-39 and U-64 from the <sc>Ulm-TSST</sc> dataset. Shaded (green) area represents segments detected as having speech by the VAD. Predictions have been smoothed with a moving average filter with a window size of 30 for visibility. <bold>(A)</bold> R-1 BPM. <bold>(B)</bold> U-39 BPM. <bold>(C)</bold> U-64 BPM.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g012.tif"/>
</fig>
<p>Despite the relatively low performance obtained by our speech-to-BPM models, we still use them to obtain BPM predictions on the <sc>FAU-TSST</sc> dataset, as we are primarily interested in the usefulness of predicted BPM values for stress modelling. In <xref ref-type="fig" rid="F11">Figure&#x20;11</xref>, we show the distribution of predicted BPM values for cortisol measurements obtained at different time points. Surprisingly, we observe a downward trend for BPMs as the stress level increases. This counterintuitive finding can be explained as follows: when subjects move a lot, the BPM monitoring devices may lead to erroneous measurements. Therefore, rather than these low measurements implying that stress leads to a lower BPM, we interpret them as a demonstration that BPM signals, though theoretically well justified as predictors of stress, are nevertheless a challenge to collect in practice. Thus, BPM alone may be inferior to signals like voice that are easier to manage and provide richer information for evaluation. However, the trend is not what we expect. We still see a separation between different cortisol levels, indicating that predicting HR from speech signals can be a useful proxy for stress prediction. Two-sample independent t-tests show that all differences are significant at the <italic>p</italic>&#x20;&#x3c; 0.05 level except the middle vs high percentiles as measured at&#x20;S4.</p>
</sec>
<sec id="s6-4">
<title>6.4 Respiration</title>
<p>The final biological signal we examine here is respiration derived from chest displacement with a range of (&#x2212;10:&#x2b;10), which, similarly to the emotional dimensions, is only available for <sc>Ulm-TSST</sc>. Based on previous research (<xref ref-type="bibr" rid="B73">Suess et&#x20;al., 1980</xref>), we expect this signal to have a solid connection to stress. Although this physiological signal has strong potential for several affective applications (<xref ref-type="bibr" rid="B78">Wu et&#x20;al., 2010</xref>; <xref ref-type="bibr" rid="B31">Ishii et&#x20;al., 2016</xref>; <xref ref-type="bibr" rid="B83">Zhang et&#x20;al., 2017</xref>), to the best of our knowledge, there has been little work on predicting it from other modalities. As we have both audio and video signals available for <sc>Ulm-TSST</sc>, we attempt to use both to model respiration. However, similar to the other biomarkers, we only use the audio modality when predicting this signal for the other two datasets, as this is the only modality shared among all. Given that the processes of breathing and vocalising share related anatomy, we naturally expect the audio modality to be a strong predictor of respiration. Similarly to the emotional dimensions, cf. <xref ref-type="sec" rid="s6-2">Section 6.2</xref>, we only predict respiration on the interview task of <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc>, as this was the only task available for <sc>Ulm-TSST</sc>.</p>
<p>In <xref ref-type="table" rid="T4">Table&#x20;4</xref>, we show multimodal results for the recognition of respiration rate in the <sc>Ulm-TSST</sc> corpus. As the signal is measured in arbitrary units, we report NRMSE, which is equivalent to the standard RMSE normalised by the target range. Late fusion of the two modalities brings the strongest results. However, unimodal <sc>eGeMAPS</sc> &#x2009; features appear to be only slightly lower than the best multimodal result, indicating that they can be used to predict respiration in isolation. This is not too surprising, as speech and respiration are likely highly correlated, and artefacts from breath will inherently remain with the audio features. This effect may be present to a lesser degree for the <sc>VGGface</sc>&#x2009; features, mainly due to possible occlusions, which may prevent the observation of mouth movements related to deeper breaths.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Normalised root mean squared error (NRMSE) results for unimodal and multimodal Audio &#x2b; Video (A &#x2b; V) respiration prediction range [&#x2212;10:10] on the development and test sets of the <sc>Ulm-TSST</sc> corpus utilising an LSTM. For the multimodal results, we perform both early and late fusion.</p>
</caption>
<table>
<tbody valign="top">
<tr>
<td align="left">
<bold>NRMSE</bold>
</td>
<td align="center">
<bold>Dev</bold>
</td>
<td align="center">
<bold>Test</bold>
</td>
</tr>
<tr>
<td align="left">
<sc>eGeMAPS</sc>&#x2009;</td>
<td align="center">0.118</td>
<td align="center">0.122</td>
</tr>
<tr>
<td align="left">
<sc>VGGface</sc>&#x2009;</td>
<td align="center">0.146</td>
<td align="center">0.139</td>
</tr>
<tr>
<td align="left">A &#x2b; V (Early)</td>
<td align="center">0.142</td>
<td align="center">0.143</td>
</tr>
<tr>
<td align="left">A &#x2b; V (Late)</td>
<td align="center">
<bold>0.120</bold>
</td>
<td align="center">
<bold>0.120</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Emphasised results indicate strongest performance on given evaluation set.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>From the box plots of <xref ref-type="fig" rid="F13">Figure&#x20;13</xref>, we can observe that the respiration signals predicted appear to behave in an expected way for such cortisol groupings, for both the <sc>FAU-TSST</sc> and <sc>Reg-TSST</sc> corpora. For example, we observe a rise in respiration levels as cortisol increases for <sc>Reg-TSST</sc>; this trend also manifests for <sc>FAU-TSST</sc> but is less pronounced. Two-sample independent t-tests show that all differences are significant at the <italic>p</italic>&#x20;&#x3c; 0.05 level, except the mid vs high percentiles sampled at S1, S3, S6, and S7 for <sc>FAU-TSST</sc>, the high vs low percentiles measured at S5 for <sc>FAU-TSST</sc>, and the low vs mid-percentiles sampled at S4 and S5 for <sc>Reg-TSST</sc>. This shows that predicted respiration signals can be valuable biomarkers of stress. Our results show that respiration can be successfully recognised from both speech and other audio, and that the predicted signals can be used for identifying speaker states. As respiration prediction from other modalities remains an underexplored topic, our findings warrant a closer investigation in follow-up&#x20;work.</p>
<fig id="F13" position="float">
<label>FIGURE 13</label>
<caption>
<p>Box plots of predicted respiration values for the <sc>FAU-TSST</sc> <bold>(A)</bold> and <sc>Reg-TSST</sc> <bold>(B)</bold> datasets. Grouped based on raw cortisol (nmol/L) measures taken at time points S2-S5.</p>
</caption>
<graphic xlink:href="fcomp-03-750284-g013.tif"/>
</fig>
</sec>
</sec>
<sec id="s7">
<title>7 Limitations</title>
<p>Dealing with human naturalistic data brings several challenges from a machine learning perspective, and from the analysis we have performed on the three corpora of interest, we see that variance in the display of physiological signals is one such challenge. In this current work, our approach was somewhat &#x201c;brute-force&#x201d; in nature, in that we did not condition the models or &#x201c;correct&#x201d; the targets with consideration to any specific subject or corpora variance. This can be more limiting when it comes to variation due to the assay applied for cortisol extraction. A transformation from raw cortisol values derived from different assays to cortisol factor scores for better comparison has been suggested by <xref ref-type="bibr" rid="B45">Miller et&#x20;al. (2013)</xref>, but this approach needs replication to ascertain its reliability and validity. It would be of interest to explore the benefit of this correction in future work, as well as other personalised training methods which may allow for a more robust result which in turn is more globally generalisable (<xref ref-type="bibr" rid="B76">Triantafyllopoulos et&#x20;al., 2021</xref>).</p>
<p>Further to this, within the corpora themselves, there is a heavy gender bias, which it should be noted may have an implicit effect on the results obtained. For the <sc>FAU-TSST</sc> and <sc>Ulm-TSST</sc> sets, this is particularly prominent. Although this is considered in the acoustic analysis conducted, the manifestation of stress is generally known to vary across genders. In further work, personalised training strategies would aid in exploring this potential bias. Similarly, regarding demographics, the mean age across all corpora is 24.02&#xa0;years, with a reasonably small standard deviation of 4.13&#xa0;years. This of course limits the current work as being only applicable to this age range, due to the inherent variance that stress is known to have throughout a lifetime, from factors including hormonal changes and overall life satisfaction. Without deeper experimental analysis, these results should not be taken to be fully generalisable to a larger and more diverse population.</p>
</sec>
<sec sec-type="conclusion" id="s8">
<title>Conclusion</title>
<p>In the current contribution, we explored several markers of stress, learning from various modalities, with a core focus on the advantage of speech-based features. We processed and unified three different corpora collected under the well-known TSST, and we could verify our previous finding from (<xref ref-type="bibr" rid="B3">Baird et&#x20;al., 2019</xref>) that audio features are best able to predict cortisol measurements taken approximately 15&#xa0;min after the stress event. This effect was validated by a similar behaviour found on unlabelled data. This research establishes that audio can be utilised as a real-time guide for an individual&#x2019;s current state of stress. Furthermore, a similar effect was found when using video-derived features from the face, meaning that a multimodal approach may provide further confidence, particularly given the potential periods of silence during stressful situations. Moreover, we have shown that emotion, heart rate, and respiration can be reliably recognised from speech during stress and have a strong relation to cortisol levels. This is found even when these physiological markers are not available during the data collection process but are predicted using other available modalities, mainly&#x20;audio.</p>
<p>Our extensive analysis primarily shows that audio is suitable for the recognition of several physiological markers of stress. However, we do see that, as with many states of wellbeing, there is a large variance in stress manifestation in an individual, which makes generalisation a challenge. Given this, one needs to explore in follow-up work the potential for personalised machine learning strategies for this domain.</p>
</sec>
</body>
<back>
<sec id="s9">
<title>Data Availability Statement</title>
<p>The datasets analysed for this study are not available in the public research domain, unless explicit consent is given via direct contact with the data owners. Namely, <sc>FAU-TSST</sc>; the Chair of&#x20;Health Psychology, FAU Erlangen-Nuremberg, Germany&#x2014;<sc>Reg-TSST</sc>; the Institute of Psychology, University of Regensburg&#x2014;<sc>Ulm-TSST</sc>; the Chair of Clinical Psychology and Psychotherapy, University of Ulm, Germany.</p>
</sec>
<sec id="s10">
<title>Ethics Statement</title>
<p>The studies involving human participants were reviewed and approved by the data owner&#x2019;s university ethics committee: Institute of Psychology, University of Regensburg, Germany&#x2014;Chair of Health Psychology. FAU Erlangen-Nuremberg, Germany&#x2014;Chair of Clinical Psychology and Psychotherapy, University of Ulm, Germany. The patients/participants provided their written informed consent to participate in this study.</p>
</sec>
<sec id="s11">
<title>Author Contributions</title>
<p>AB: literature analysis, data preparation, experimental design, computational analysis, manuscript drafting and preparation. AT: experimental design, computational analysis, manuscript drafting and preparation. SZ: data acquisition, literature analysis, computational analysis, manuscript editing. SO: data preparation, literature analysis, manuscript editing. LC: data preparation, literature analysis, manuscript editing. LS: literature analysis, manuscript editing. JK: data acquisition. SS: data acquisition. E-MM: data acquisition. BK: manuscript editing. NR: manuscript editing. HB: manuscript editing. BS: technical guidance and manuscript editing. All authors revised, developed, read, and approved the final manuscript.</p>
</sec>
<sec id="s12">
<title>Funding</title>
<p>This project received funding from the German Research Foundation&#x2019;s (DFG) Reinhart Koselleck project No. 442218748 (AUDI0NOMOUS), the Zentrales Innovationsprogramm Mittelstand (ZIM) under grant agreement No. 16KN069455 (KIRun), and the DFG grant number KU 1401/6-1&#x20;(BM).</p>
</sec>
<sec sec-type="COI-statement" id="s13">
<title>Conflict of Interest</title>
<p>AT and BS were employed by AudEERING&#x20;GmbH.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s14">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn1">
<label>1</label>
<p>For physiological signals, the <sc>Reg-TSST</sc> corpus utilised the Polar RS800CX and V800 system, and the <sc>Ulm-TSST</sc> corpus used the BIOPAC Systems,&#x20;MP35.</p>
</fn>
<fn id="fn2">
<label>2</label>
<p>
<sc>Reg-TSST</sc>: Polar RS800CX and V800 system, and <sc>Ulm-TSST</sc>: BIOPAC Systems,&#x20;MP35.</p>
</fn>
<fn id="fn3">
<label>3</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/lstappen/MuSe2021">https://github.com/lstappen/MuSe2021</ext-link>
</p>
</fn>
<fn id="fn4">
<label>4</label>
<p>
<ext-link ext-link-type="uri" xlink:href="https://github.com/lstappen/MuSe2021">https://github.com/lstappen/MuSe2021</ext-link>
</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Akselrod</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gordon</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ubel</surname>
<given-names>F. A.</given-names>
</name>
<name>
<surname>Shannon</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Berger</surname>
<given-names>A. C.</given-names>
</name>
<name>
<surname>Cohen</surname>
<given-names>R. J.</given-names>
</name>
</person-group> (<year>1981</year>). <article-title>Power Spectrum Analysis of Heart Rate Fluctuation: a Quantitative Probe of Beat-To-Beat Cardiovascular Control</article-title>. <source>Science</source> <volume>213</volume>, <fpage>220</fpage>&#x2013;<lpage>222</lpage>. <pub-id pub-id-type="doi">10.1126/science.6166045</pub-id> </citation>
</ref>
<ref id="B2">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Amiriparian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gerczuk</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ottl</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cummins</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Freitag</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Pugachevskiy</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Snore Sound Classification Using Image-Based Deep Spectrum Features</article-title>,&#x201d; in <source>Proc. Interspeech</source> (<publisher-loc>Stockholm, Sweden</publisher-loc>, <volume>2017</volume>, <fpage>3512</fpage>&#x2013;<lpage>3516</lpage>. <pub-id pub-id-type="doi">10.21437/interspeech.2017-434</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Amiriparian</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cummins</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Sturmbauer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Janson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Me&#x00DF;ner</surname>
<given-names>E.-M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Using Speech to Predict Sequentially Measured Cortisol Levels during a Trier Social Stress Test</article-title>,&#x201d; in <source>Proc. Interspeech 2019</source> (<publisher-loc>India</publisher-loc>: <publisher-name>Hyderabad</publisher-name>), <fpage>534</fpage>&#x2013;<lpage>538</lpage>. <pub-id pub-id-type="doi">10.21437/interspeech.2019-1352</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cummins</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Schnieder</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B. W.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>An Evaluation of the Effect of Anxiety on Speech&#x2013;Computational Prediction of Anxiety from Sustained Vowels</article-title>,&#x201d; in <source>Proc. INTERSPEECH 2020</source> (<publisher-loc>Shanghai, China</publisher-loc>: <publisher-name>ISCA</publisher-name>), <fpage>4951</fpage>&#x2013;<lpage>4955</lpage>. </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ben-David</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Blitzer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Crammer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kulesza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pereira</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Vaughan</surname>
<given-names>J.&#x20;W.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>A Theory of Learning from Different Domains</article-title>. <source>Mach Learn.</source> <volume>79</volume>, <fpage>151</fpage>&#x2013;<lpage>175</lpage>. <pub-id pub-id-type="doi">10.1007/s10994-009-5152-4</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bernardi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wdowczyk-Szulc</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Valenti</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Castoldi</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Passino</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Spadacini</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2000</year>). <article-title>Effects of Controlled Breathing, Mental Activity and Mental Stress with or without Verbalization on Heart Rate Variability</article-title>. <source>J.&#x20;Am. Coll. Cardiol.</source> <volume>35</volume>, <fpage>1462</fpage>&#x2013;<lpage>1469</lpage>. <pub-id pub-id-type="doi">10.1016/s0735-1097(00)00595-7</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berntson</surname>
<given-names>G. G.</given-names>
</name>
<name>
<surname>Cacioppo</surname>
<given-names>J.&#x20;T.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Heart Rate Variability: Stress and Psychiatric Conditions</article-title>. <source>Dynamic Electrocardiography</source>, <fpage>57</fpage>&#x2013;<lpage>64</lpage>. </citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bianco</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Napoletano</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Schettini</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Multimodal Car Driver Stress Recognition</article-title>,&#x201d; in <conf-name>Proc. International Conference on Pervasive Computing Technologies for Healthcare</conf-name>, <fpage>302</fpage>&#x2013;<lpage>307</lpage>. <pub-id pub-id-type="doi">10.1145/3329189.3329221</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brugnera</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zarbo</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Tarvainen</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Marchettini</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Adorni</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Compare</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Heart Rate Variability during Acute Psychosocial Stress: A Randomized Cross-Over Trial of Verbal and Non-verbal Laboratory Stressors</article-title>. <source>Int. J.&#x20;Psychophysiology</source> <volume>127</volume>, <fpage>17</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1016/j.ijpsycho.2018.02.016</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cho</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Bianchi-Berthouze</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Julier</surname>
<given-names>S. J.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Deepbreath: Deep Learning of Breathing Patterns for Automatic Stress Recognition Using Low-Cost thermal Imaging in Unconstrained Settings</article-title>,&#x201d; in <conf-name>2017 Seventh International Conference on Affective Computing and Intelligent Interaction</conf-name> (<publisher-loc>San Antonio, TX</publisher-loc>: <publisher-name>ACII</publisher-name>), <fpage>456</fpage>&#x2013;<lpage>463</lpage>. <pub-id pub-id-type="doi">10.1109/acii.2017.8273639</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cummins</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The Increasing Impact of Deep Learning on Speech Analysis for Health: Challenges and Opportunities</article-title>. <source>Methods Spec. Issue. Translational Data analytics Health Inform.</source> <volume>151</volume>, <fpage>41</fpage>&#x2013;<lpage>54</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2018.07.007</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cuno</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Condori-Fernandez</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Mendoza</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lov&#xf3;n</surname>
<given-names>W. R.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>A Fair Evaluation of Public Datasets for Stress Detection Systems</article-title>,&#x201d; in <conf-name>2020 39th International Conference of the Chilean Computer Science</conf-name> (<publisher-loc>Coquimbo, Chile</publisher-loc>: <publisher-name>Society SCCC</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/SCCC51225.2020.9281274</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dalmeida</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Masala</surname>
<given-names>G. L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Hrv Features as Viable Physiological Markers for Stress Detection Using Wearable Devices</article-title>. <source>Sensors</source> <volume>21</volume>, <fpage>2873</fpage>. <pub-id pub-id-type="doi">10.3390/s21082873</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhama</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Latheef</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Dadar</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Samad</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Munjal</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Khandia</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Biomarkers in Stress Related Diseases/disorders: Diagnostic, Prognostic, and Therapeutic Values</article-title>. <source>Front. Mol. Biosciences</source> <volume>6</volume>. <pub-id pub-id-type="doi">10.3389/fmolb.2019.00091</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dickerson</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Kemeny</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>Acute Stressors and Cortisol Responses: a Theoretical Integration and Synthesis of Laboratory Research</article-title>. <source>Psychol. Bull.</source> <volume>130</volume>, <fpage>355</fpage>. <pub-id pub-id-type="doi">10.1037/0033-2909.130.3.355</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eyben</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Scherer</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Sundberg</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Andr&#xe9;</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Busso</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>The Geneva Minimalistic Acoustic Parameter Set (GeMAPS) for Voice Research and Affective Computing</article-title>. <source>IEEE Trans. Affective Comput.</source> <volume>7</volume>, <fpage>190</fpage>&#x2013;<lpage>202</lpage>. <pub-id pub-id-type="doi">10.1109/taffc.2015.2457417</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Eyben</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Weninger</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Gross</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Recent Developments in openSMILE, the Munich Open-Source Multimedia Feature Extractor</article-title>,&#x201d; in <conf-name>Proc. International Conference Multimedia</conf-name> (<publisher-loc>Barcelona, Spain</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>835</fpage>&#x2013;<lpage>838</lpage>. <pub-id pub-id-type="doi">10.1145/2502081.2502224</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fendel</surname>
<given-names>J.&#x20;C.</given-names>
</name>
<name>
<surname>B&#xfc;rkle</surname>
<given-names>J.&#x20;J.</given-names>
</name>
<name>
<surname>G&#xf6;ritz</surname>
<given-names>A. S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Mindfulness-based Interventions to Reduce Burnout and Stress in Physicians: a Systematic Review and Meta-Analysis</article-title>. <source>Acad. Med.</source> <volume>96</volume>, <fpage>751</fpage>&#x2013;<lpage>764</lpage>. </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garcia-Ceja</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Riegler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nordgreen</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Jakobsen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Oedegaard</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>T&#xf8;rresen</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Mental Health Monitoring with Multimodal Sensing and Machine Learning: A Survey</article-title>. <source>Pervasive Mobile Comput.</source> <volume>51</volume>, <fpage>1</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="doi">10.1016/j.pmcj.2018.09.003</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Gemmeke</surname>
<given-names>J.&#x20;F.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Freedman</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jansen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lawrence</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>R. C.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Audio Set: An Ontology and Human-Labeled Dataset for Audio Events</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>776</fpage>&#x2013;<lpage>780</lpage>. <pub-id pub-id-type="doi">10.1109/icassp.2017.7952261</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Giddens</surname>
<given-names>C. L.</given-names>
</name>
<name>
<surname>Barron</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Byrd-Craven</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Clark</surname>
<given-names>K. F.</given-names>
</name>
<name>
<surname>Winter</surname>
<given-names>A. S.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Vocal Indices of Stress: a Review</article-title>. <source>J.&#x20;Voice</source> <volume>27</volume>, <fpage>390</fpage>&#x2013;<lpage>e21</lpage>. <pub-id pub-id-type="doi">10.1016/j.jvoice.2012.12.010</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goldstein</surname>
<given-names>D. S.</given-names>
</name>
</person-group> (<year>1987</year>). <article-title>Stress-induced Activation of the Sympathetic Nervous System</article-title>. <source>Bailliere&#x2019;s Clin. Endocrinol. Metab.</source> <volume>1</volume>, <fpage>253</fpage>&#x2013;<lpage>278</lpage>. <pub-id pub-id-type="doi">10.1016/s0950-351x(87)80063-0</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>G&#xf6;n&#xfc;late&#x15f;</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Tetik</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>D&#xfc;ndar</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Tansu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>D&#xfc;ndar</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Analyzing the before and after Effects of Endurance Training on Acth Hormone</article-title>. <source>Int. J.&#x20;Sport Cult. Sci.</source> <volume>5</volume>, <fpage>340</fpage>&#x2013;<lpage>346</lpage>. </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodman</surname>
<given-names>W. K.</given-names>
</name>
<name>
<surname>Janson</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wolf</surname>
<given-names>J.&#x20;M.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Meta-analytical Assessment of the Effects of Protocol Variations on Cortisol Responses to the Trier Social Stress Test</article-title>. <source>Psychoneuroendocrinology</source> <volume>80</volume>, <fpage>26</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1016/j.psyneuen.2017.02.030</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grzadzielewska</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Using Machine Learning in Burnout Prediction: A Survey</article-title>. <source>Child. Adolesc. Soc. Work J.</source> <volume>38</volume>, <fpage>175</fpage>&#x2013;<lpage>180</lpage>. </citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hagerer</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Pandit</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Eyben</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Enhancing Lstm Rnn-Based Speech Overlap Detection by Artificially Mixed Data</article-title>,&#x201d; in <conf-name>Audio Engineering Society Conference: 2017 AES International Conference on Semantic Audio</conf-name>. <comment>no pagination</comment>. </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haider</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>De La Fuente</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Luz</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>An Assessment of Paralinguistic Acoustic Features for Detection of Alzheimer&#x2019;s Dementia in Spontaneous Speech</article-title>. <source>IEEE J.&#x20;Selected Top. Signal Process.</source> <volume>14</volume>, <fpage>272</fpage>&#x2013;<lpage>281</lpage>. </citation>
</ref>
<ref id="B28">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hansen</surname>
<given-names>J.&#x20;H.</given-names>
</name>
<name>
<surname>Bou-Ghazale</surname>
<given-names>S. E.</given-names>
</name>
</person-group> (<year>1997</year>). &#x201c;<article-title>Getting Started with Susas: A Speech under Simulated and Actual Stress Database</article-title>,&#x201d; in <source>Proc. Eurospeech.</source>, <fpage>1743</fpage>&#x2013;<lpage>1746</lpage>. </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Healey</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Picard</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Detecting Stress during Real-World Driving Tasks Using Physiological Sensors</article-title>. <source>IEEE Trans. Intell. Transportation Syst.</source> <volume>6</volume>, <fpage>156</fpage>&#x2013;<lpage>166</lpage>. <pub-id pub-id-type="doi">10.1109/TITS.2005.848368</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Hershey</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chaudhuri</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>D. P.</given-names>
</name>
<name>
<surname>Gemmeke</surname>
<given-names>J.&#x20;F.</given-names>
</name>
<name>
<surname>Jansen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Moore</surname>
<given-names>R. C.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>Cnn Architectures for Large-Scale Audio Classification</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>131</fpage>&#x2013;<lpage>135</lpage>. <pub-id pub-id-type="doi">10.1109/icassp.2017.7952132</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ishii</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Otsuka</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kumano</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yamato</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Using Respiration to Predict Who Will Speak Next and when in Multiparty Meetings</article-title>. <source>ACM Trans. Interactive Intell. Syst. (Tiis)</source> <volume>6</volume>, <fpage>1</fpage>&#x2013;<lpage>20</lpage>. <pub-id pub-id-type="doi">10.1145/2946838</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jati</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>P. G.</given-names>
</name>
<name>
<surname>Baucom</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Georgiou</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Towards Predicting Physiology from Speech during Stressful Conversations: Heart Rate and Respiratory Sinus Arrhythmia</article-title>,&#x201d; in <conf-name>2018 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>4944</fpage>&#x2013;<lpage>4948</lpage>. <pub-id pub-id-type="doi">10.1109/icassp.2018.8461500</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Johnson</surname>
<given-names>A. K.</given-names>
</name>
<name>
<surname>Anderson</surname>
<given-names>E. A.</given-names>
</name>
</person-group> (<year>1990</year>). <source>Stress and Arousal</source>. <publisher-loc>Ithaca, NY</publisher-loc>: <publisher-name>APA PsycNET</publisher-name>. </citation>
</ref>
<ref id="B34">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kwon</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>O&#x2019;Connell</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Toward Estimating Personal Well-Being Using Voice</source>. <comment>arXiv preprint arXiv:1910.10082</comment>. </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kirschbaum</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Pirke</surname>
<given-names>K.-M.</given-names>
</name>
<name>
<surname>Hellhammer</surname>
<given-names>D. H.</given-names>
</name>
</person-group> (<year>1993</year>). <article-title>The &#x2018;Trier Social Stress Test&#x2019;&#x2013;A Tool for Investigating Psychobiological Stress Responses in a Laboratory Setting</article-title>. <source>Neuropsychobiology</source> <volume>28</volume>, <fpage>76</fpage>&#x2013;<lpage>81</lpage>. <pub-id pub-id-type="doi">10.1159/000119004</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Koldijk</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sappelli</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Verberne</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Neerincx</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Kraaij</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>The Swell Knowledge Work Dataset for Stress and User Modeling Research</article-title>,&#x201d; in <conf-name>Proc. of the 16th international conference on multimodal interaction</conf-name>, <fpage>291</fpage>&#x2013;<lpage>298</lpage>. <pub-id pub-id-type="doi">10.1145/2663204.2663257</pub-id> </citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kovalenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kastyro</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Torshin</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Guhschina</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Doroginskaya</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kamanina</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Comparison of Immediate Effects of Vocal Breathing Exercises and Physical Exercises on Heart Rate Variability in Healthy Students</article-title>,&#x201d; in <conf-name>Proc. Models and Analysis of Vocal Emissions for BioMedical Applications: International Workshop</conf-name> (<publisher-loc>Firenze, Italy</publisher-loc>: <publisher-name>Firenze University Press</publisher-name>), <fpage>245</fpage>. </citation>
</ref>
<ref id="B38">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Krizhevsky</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sutskever</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G. E.</given-names>
</name>
</person-group> (<year>2012</year>). &#x201c;<article-title>ImageNet Classification with Deep Convolutional Neural Networks</article-title>,&#x201d; in <conf-name>Advances in Neural Information Processing Systems 25</conf-name>. Editors <person-group person-group-type="editor">
<name>
<surname>Pereira</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Burges</surname>
<given-names>C. J.&#x20;C.</given-names>
</name>
<name>
<surname>Bottou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Weinberger</surname>
<given-names>K. Q.</given-names>
</name>
</person-group> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates, Inc.</publisher-name>), <fpage>1097</fpage>&#x2013;<lpage>1105</lpage>. </citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021a</year>). <article-title>Hierarchical Deep Neural Network for Mental Stress State Detection Using Iot Based Biomarkers</article-title>. <source>Pattern Recognition Lett.</source> <volume>145</volume>, <fpage>81</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1016/j.patrec.2021.01.030</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Iftekhar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goebel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bullock</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>MacLean</surname>
<given-names>M. H.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>M. B.</given-names>
</name>
<etal/>
</person-group> (<year>2021b</year>). &#x201c;<article-title>Stressnet: Detecting Stress in thermal Videos</article-title>,&#x201d; in <conf-name>Proc. of International Conference on Applications of Computer Vision</conf-name>, <fpage>999</fpage>&#x2013;<lpage>1009</lpage>. <pub-id pub-id-type="doi">10.1109/wacv48630.2021.00104</pub-id> </citation>
</ref>
<ref id="B41">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Leistner</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Menke</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Hypothalamic&#x2013;pituitary&#x2013;adrenal axis and Stress</article-title>,&#x201d; in <source>Handbook of Clinical Neurology</source> (<publisher-name>Elsevier</publisher-name>), <volume>175</volume>, <fpage>55</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1016/b978-0-444-64123-6.00004-7</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Deep Learning Face Attributes in the Wild</article-title>,&#x201d; in <conf-name>Proc. of International Conference on Computer Vision</conf-name>. <comment>no pagination</comment>. <pub-id pub-id-type="doi">10.1109/iccv.2015.425</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>MacLaughlin</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Noone</surname>
<given-names>A.-M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Harazduk</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Lumpkin</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Stress Biomarkers in Medical Students Participating in a Mind Body Medicine Skills Program</article-title>. <source>Evidence-Based Complement. Altern. Med.</source> <volume>2011</volume>. <pub-id pub-id-type="doi">10.1093/ecam/neq039</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Mertes</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schiller</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>Andr&#xe9;</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>An Evolutionary-Based Generative Approach for Audio Data Augmentation</article-title>,&#x201d; in <conf-name>Proc. 2020 IEEE 22nd International Workshop on Multimedia Signal Processing (MMSP)</conf-name>, <conf-loc>Tampere, Finland</conf-loc> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/mmsp48831.2020.9287156</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miller</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Plessow</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Rauh</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gr&#xf6;schl</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kirschbaum</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Comparison of Salivary Cortisol as Measured by Different Immunoassays and Tandem Mass Spectrometry</article-title>. <source>Psychoneuroendocrinology</source> <volume>38</volume>, <fpage>50</fpage>&#x2013;<lpage>57</lpage>. <pub-id pub-id-type="doi">10.1016/j.psyneuen.2012.04.019</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Mousa</surname>
<given-names>A. E.-D.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B. W.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep Bidirectional Long Short-Term Memory Recurrent Neural Networks for Grapheme-To-Phoneme Conversion Utilizing Complex many-to-many Alignments</article-title>,&#x201d; in <source>Interspeech</source>, <fpage>2836</fpage>&#x2013;<lpage>2840</lpage>. <pub-id pub-id-type="doi">10.21437/interspeech.2016-1229</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nath</surname>
<given-names>R. K.</given-names>
</name>
<name>
<surname>Thapliyal</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Caban-Holt</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Machine Learning Based Stress Monitoring in Older Adults Using Wearable Sensors and Cortisol as Stress Biomarker</article-title>. <source>J.&#x20;Signal Process. Syst.</source>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1007/s11265-020-01611-5</pub-id> </citation>
</ref>
<ref id="B48">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Niu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Synrhythm: Learning a Deep Heart Rate Estimator from General to Specific</article-title>,&#x201d; in <conf-name>2018 24th International Conference on Pattern Recognition (ICPR)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>3580</fpage>&#x2013;<lpage>3585</lpage>. <pub-id pub-id-type="doi">10.1109/icpr.2018.8546321</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Orlikoff</surname>
<given-names>R. F.</given-names>
</name>
<name>
<surname>Baken</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1989</year>). <article-title>The Effect of the Heartbeat on Vocal Fundamental Frequency Perturbation</article-title>. <source>J.&#x20;Speech, Lang. Hearing Res.</source> <volume>32</volume>, <fpage>576</fpage>&#x2013;<lpage>582</lpage>. <pub-id pub-id-type="doi">10.1044/jshr.3203.576</pub-id> </citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pag&#xe1;n-Casta&#xf1;o</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Maseda-Moreno</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Santos-Rojo</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Wellbeing in Work Environments</article-title>. <source>J.&#x20;Business Res.</source> <volume>115</volume>, <fpage>469</fpage>&#x2013;<lpage>474</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbusres.2019.12.007</pub-id> </citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Panicker</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Gayathri</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>A Survey of Machine Learning Techniques in Physiology Based Mental Stress Detection Systems</article-title>. <source>Biocybernetics Biomed. Eng.</source> <volume>39</volume>, <fpage>444</fpage>&#x2013;<lpage>469</lpage>. <pub-id pub-id-type="doi">10.1016/j.bbe.2019.01.004</pub-id> </citation>
</ref>
<ref id="B52">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Parkhi</surname>
<given-names>O. M.</given-names>
</name>
<name>
<surname>Vedaldi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Deep Face Recognition</article-title>,&#x201d; in <conf-name>Proc. of the British Machine Vision Conference</conf-name>, <volume>41</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.5244/c.29.41</pub-id> </citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine Learning in Python</article-title>. <source>J.&#x20;Machine Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. </citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pisanski</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Nowak</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sorokowski</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Individual Differences in Cortisol Stress Response Predict Increases in Voice Pitch during Exam Stress</article-title>. <source>Physiol. Behav.</source> <volume>163</volume>, <fpage>234</fpage>&#x2013;<lpage>238</lpage>. <pub-id pub-id-type="doi">10.1016/j.physbeh.2016.05.018</pub-id> </citation>
</ref>
<ref id="B55">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Plarre</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Raij</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hossain</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Ali</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Nakajima</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Al&#x2019;Absi</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). &#x201c;<article-title>Continuous Inference of Psychological Stress from Sensory Measurements Collected in the Natural Environment</article-title>,&#x201d; in <conf-name>Proc. of the 10th ACM/IEEE international conference on information processing in sensor networks</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>97</fpage>&#x2013;<lpage>108</lpage>. </citation>
</ref>
<ref id="B56">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Protopapas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lieberman</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>Fundamental Frequency of Phonation and Perceived Emotional Stress</article-title>. <source>The J.&#x20;Acoust. Soc. America</source> <volume>101</volume>, <fpage>2267</fpage>&#x2013;<lpage>2277</lpage>. <pub-id pub-id-type="doi">10.1121/1.418247</pub-id> </citation>
</ref>
<ref id="B57">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rodr&#xed;guez-Arce</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lara-Flores</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Portillo-Rodr&#xed;guez</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Mart&#xed;nez-M&#xe9;ndez</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Towards an Anxiety and Stress Recognition System for Academic Environments Based on Physiological Features</article-title>. <source>Comp. Methods Programs Biomed.</source> <volume>190</volume>, <fpage>105408</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2020.105408</pub-id> </citation>
</ref>
<ref id="B58">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rohleder</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Nater</surname>
<given-names>U. M.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Determinants of Salivary <italic>&#x3b1;</italic>-amylase in Humans and Methodological Considerations</article-title>. <source>Psychoneuroendocrinology</source> <volume>34</volume>, <fpage>469</fpage>&#x2013;<lpage>485</lpage>. <pub-id pub-id-type="doi">10.1016/j.psyneuen.2008.12.004</pub-id> </citation>
</ref>
<ref id="B59">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Russell</surname>
<given-names>J.&#x20;A.</given-names>
</name>
</person-group> (<year>1980</year>). <article-title>A Circumplex Model of Affect</article-title>. <source>J.&#x20;Personal. Soc. Psychol.</source> <volume>39</volume>, <fpage>1161</fpage>&#x2013;<lpage>1178</lpage>. <pub-id pub-id-type="doi">10.1037/h0077714</pub-id> </citation>
</ref>
<ref id="B60">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Saitis</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kalimeri</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Multimodal Classification of Stressful Environments in Visually Impaired Mobility Using Eeg and Peripheral Biosignals</article-title>. <source>IEEE Trans. Affective Comput.</source> <volume>12</volume>, <fpage>203</fpage>&#x2013;<lpage>214</lpage>. </citation>
</ref>
<ref id="B61">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>&#x160;alkevicius</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dama&#x161;evi&#x10d;ius</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Maskeliunas</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Laukien&#x117;</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Anxiety Level Recognition for Virtual Reality Therapy System Using Physiological Signals</article-title>. <source>Electronics</source> <volume>8</volume>, <fpage>1039</fpage>. </citation>
</ref>
<ref id="B62">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sano</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Picard</surname>
<given-names>R. W.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Stress Recognition Using Wearable Sensors and mobile Phones</article-title>,&#x201d; in <conf-name>2013 Humaine association conference on affective computing and intelligent interaction</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>671</fpage>&#x2013;<lpage>676</lpage>. <pub-id pub-id-type="doi">10.1109/acii.2013.117</pub-id> </citation>
</ref>
<ref id="B63">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sawilowsky</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Blair</surname>
<given-names>R. C.</given-names>
</name>
</person-group> (<year>1992</year>). <article-title>A More Realistic Look at the Robustness and Type Ii Error Properties of the T Test to Departures from Population Normality</article-title>. <source>Psychol. Bull.</source> <volume>111</volume>, <fpage>352</fpage>. <pub-id pub-id-type="doi">10.1037/0033-2909.111.2.352</pub-id> </citation>
</ref>
<ref id="B64">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Schmidt</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Reiss</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Duerichen</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Marberger</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Van Laerhoven</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Introducing Wesad, a Multimodal Dataset for Wearable Stress and Affect Detection</article-title>,&#x201d; in <conf-name>Proc. of International Conference on Multimodal Interaction</conf-name> (<publisher-loc>Boulder, CO, USA</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>400</fpage>&#x2013;<lpage>408</lpage>. <pub-id pub-id-type="doi">10.1145/3242969.3242985</pub-id> </citation>
</ref>
<ref id="B65">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Friedmann</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Eyben</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2013</year>). &#x201c;<article-title>Automatic Recognition of Physiological Parameters in the Human Voice: Heart Rate and Skin Conductance</article-title>,&#x201d; in <conf-name>Proc. International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>7219</fpage>&#x2013;<lpage>7223</lpage>. <pub-id pub-id-type="doi">10.1109/icassp.2013.6639064</pub-id> </citation>
</ref>
<ref id="B66">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Schuller</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>Batliner</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bergler</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Me&#x00DF;ner</surname>
<given-names>E.-M.</given-names>
</name>
<name>
<surname>Hamilton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Amiriparian</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). &#x201c;<article-title>The Interspeech 2020 Computational Paralinguistics Challenge: Elderly Emotion, Breathing &#x26; Masks</article-title>,&#x201d; in <conf-name>Proc. Interspeech 2020</conf-name> (<publisher-loc>Shanghai, China</publisher-loc>: <publisher-name>ISCA</publisher-name>), <fpage>2042</fpage>&#x2013;<lpage>2046</lpage>. <pub-id pub-id-type="doi">10.21437/interspeech.2020-32</pub-id> </citation>
</ref>
<ref id="B67">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sharma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A Comprehensive Review and Analysis of Supervised-Learning and Soft Computing Techniques for Stress Diagnosis in Humans</article-title>. <source>Comput. Biol. Med.</source> <volume>134</volume>, <fpage>104450</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2021.104450</pub-id> </citation>
</ref>
<ref id="B68">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). <source>Very Deep Convolutional Networks for Large-Scale Image Recognition</source>. <comment>arXiv preprint arXiv:1409.1556.</comment> </citation>
</ref>
<ref id="B69">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Smith</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tsiartas</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shriberg</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kathol</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Willoughby</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>de Zambotti</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Analysis and Prediction of Heart Rate Using Speech Features from Natural Speech</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>989</fpage>&#x2013;<lpage>993</lpage>. <pub-id pub-id-type="doi">10.1109/icassp.2017.7952304</pub-id> </citation>
</ref>
<ref id="B70">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stappen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Christ</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schumann</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sertolli</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Me&#x00DF;ner</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). &#x201c;<article-title>The MuSe 2021 Multimodal Sentiment Analysis Challenge: Sentiment, Emotion, Physiological-Emotion, and Stress</article-title>,&#x201d; in <conf-name>Proc. 2nd International on Multimodal Sentiment Analysis in Real-life Media Challenge and Workshop</conf-name> (<publisher-loc>Chengdu, China</publisher-loc>: <publisher-name>ACM</publisher-name>). <comment>[to appear]</comment>. </citation>
</ref>
<ref id="B71">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Stappen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Schumann</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2021b</year>). &#x201c;<article-title>The Multimodal Sentiment Analysis in Car Reviews (MuSe-CaR) Dataset: Collection, Insights and Improvements</article-title>,&#x201d; in <source>IEEE Transactions on Affective Computing</source>. <pub-id pub-id-type="doi">10.1109/taffc.2021.3097002</pub-id> </citation>
</ref>
<ref id="B72">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Stappen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schumann</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sertolli</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Baird</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Weigel</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cambria</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2021c</year>). &#x201c;<article-title>MuSe-Toolbox: The Multimodal Sentiment Analysis Continuous Annotation Fusion and Discrete Class Transformation Toolbox</article-title>,&#x201d; in <conf-name>Proc. 2nd International on Multimodal Sentiment Analysis in Real-life Media Challenge and Workshop</conf-name> (<publisher-loc>Chengdu, China</publisher-loc>: <publisher-name>ACM</publisher-name>). <comment>[to appear]</comment>. </citation>
</ref>
<ref id="B73">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Suess</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Alexander</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Smith</surname>
<given-names>D. D.</given-names>
</name>
<name>
<surname>Sweeney</surname>
<given-names>H. W.</given-names>
</name>
<name>
<surname>Marion</surname>
<given-names>R. J.</given-names>
</name>
</person-group> (<year>1980</year>). <article-title>The Effects of Psychological Stress on Respiration: a Preliminary Study of Anxiety and Hyperventilation</article-title>. <source>Psychophysiology</source> <volume>17</volume>, <fpage>535</fpage>&#x2013;<lpage>540</lpage>. <pub-id pub-id-type="doi">10.1111/j.1469-8986.1980.tb02293.x</pub-id> </citation>
</ref>
<ref id="B74">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Multi-modal Continuous Dimensional Emotion Recognition Using Recurrent Neural Network and Self-Attention Mechanism</article-title>,&#x201d; in <conf-name>Proc. Multimodal Sentiment Analysis in Real-life Media Challenge and Workshop</conf-name>, <fpage>27</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1145/3423327.3423672</pub-id> </citation>
</ref>
<ref id="B75">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Taelman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Vandeput</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Spaepen</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Van Huffel</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2009</year>). &#x201c;<article-title>Influence of Mental Stress on Heart Rate and Heart Rate Variability</article-title>,&#x201d; in <conf-name>4th European Conference of the International Federation for Medical and Biological Engineering</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>1366</fpage>&#x2013;<lpage>1369</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-540-89208-3_324</pub-id> </citation>
</ref>
<ref id="B76">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Triantafyllopoulos</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schuller</surname>
<given-names>B. W.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Deep Speaker Conditioning for Speech Emotion Recognition</article-title>,&#x201d; in <conf-name>2021 IEEE International Conference on Multimedia and Expo (ICME)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/icme51207.2021.9428217</pub-id> </citation>
</ref>
<ref id="B77">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Wittchen</surname>
<given-names>H.-U.</given-names>
</name>
<name>
<surname>Zaudig</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Fydrich</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1997</year>). <source>SKID. Strukturiertes Klinisches Interview f&#xfc;r DSM-IV. Achse I und II. Handanweisung</source>. <publisher-name>Hogrefe</publisher-name>. </citation>
</ref>
<ref id="B78">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Courtney</surname>
<given-names>C. G.</given-names>
</name>
<name>
<surname>Lance</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Narayanan</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Dawson</surname>
<given-names>M. E.</given-names>
</name>
<name>
<surname>Oie</surname>
<given-names>K. S.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Optimal Arousal Identification and Classification for Affective Computing Using Physiological Signals: Virtual Reality Stroop Task</article-title>. <source>IEEE Trans. Affective Comput.</source> <volume>1</volume>, <fpage>109</fpage>&#x2013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1109/t-affc.2010.12</pub-id> </citation>
</ref>
<ref id="B79">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Loy</surname>
<given-names>C. C.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>WIDER FACE: A Face Detection Benchmark</article-title>. <source>CoRR abs/</source>
<volume>1511</volume>, <fpage>06523</fpage>. </citation>
</ref>
<ref id="B80">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zafar</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Potential Biomarkers of Emotional Stress Induced Neurodegeneration</article-title>. <source>eNeurologicalSci</source> <volume>21</volume>, <fpage>100292</fpage>. <pub-id pub-id-type="doi">10.1016/j.ensci.2020.100292</pub-id> </citation>
</ref>
<ref id="B81">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Z&#xe4;nkert</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kudielka</surname>
<given-names>B. M.</given-names>
</name>
<name>
<surname>Wuest</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Effect of Sugar Administration on Cortisol Responses to Acute Psychosocial Stress</article-title>. <source>Psychoneuroendocrinology</source> <volume>115</volume>, <fpage>104607</fpage>. <pub-id pub-id-type="doi">10.1016/j.psyneuen.2020.104607</pub-id> </citation>
</ref>
<ref id="B82">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Qiao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Joint Face Detection and Alignment Using Multitask Cascaded Convolutional Networks</article-title>. <source>IEEE Signal. Process. Lett.</source> <volume>23</volume>. <pub-id pub-id-type="doi">10.1109/lsp.2016.2603342</pub-id> </citation>
</ref>
<ref id="B83">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhan</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Respiration-based Emotion Recognition with Deep Learning</article-title>. <source>Comput. Industry</source> <volume>92</volume>, <fpage>84</fpage>&#x2013;<lpage>90</lpage>. <pub-id pub-id-type="doi">10.1016/j.compind.2017.04.005</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>