@ARTICLE{10.3389/fcomp.2020.00013,
  AUTHOR={Li, Jeng-Lin and Huang, Tzu-Yun and Chang, Chun-Min and Lee, Chi-Chun},
  TITLE={A Waveform-Feature Dual Branch Acoustic Embedding Network for Emotion Recognition},
  JOURNAL={Frontiers in Computer Science},
  VOLUME={2},
  YEAR={2020},
  URL={https://www.frontiersin.org/articles/10.3389/fcomp.2020.00013},
  DOI={10.3389/fcomp.2020.00013},
  ISSN={2624-9898},
  ABSTRACT={Research in advancing speech emotion recognition (SER) has attracted substantial attention due to its critical role in the scientific understanding of human behaviors and in a wide range of commercial applications. Conventionally, SER relies heavily on hand-crafted acoustic features. Recent progress in deep learning has attempted to model emotion directly from raw waveform in an end-to-end learning scheme; however, this approach generally remains sub-optimal. An alternative direction has been proposed to enhance and augment the knowledge-based acoustic representation with affect-related representation derived directly from raw waveform. Here, we propose a complementary waveform-feature dual branch learning network, termed the Dual-Complementary Acoustic Embedding Network (DCaEN), to effectively integrate psychoacoustic knowledge and raw waveform embedding within an augmented feature space learning approach. DCaEN contains an acoustic feature embedding network and a raw waveform network, which are jointly learned by integrating a negative cosine distance constraint into the loss function. The experimental results show that DCaEN achieves 59.31 and 46.73% unweighted average recall (UAR) on the USC IEMOCAP and MSP-IMPROV speech emotion databases, respectively, improving over modeling either hand-crafted acoustic features or raw waveform alone, and over the same network without this loss constraint. Further analysis illustrates a reverse mirroring pattern in the learned latent space, demonstrating the complementary nature of DCaEN feature space learning.}
}
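
The negative cosine distance constraint mentioned in the abstract can be illustrated with a minimal sketch. This is not the authors' implementation: it assumes PyTorch, and the branch architectures, input dimensions, class count, and weighting factor alpha are all hypothetical. It only shows how a cross-entropy objective can be combined with a cosine term that pushes the two branch embeddings apart, which is equivalent up to a constant to minimizing the negative cosine distance between them.

    # Illustrative sketch only (assumed architecture, not the paper's code):
    # a dual-branch model where a hand-crafted-feature branch and a raw-waveform
    # branch are trained jointly, with a cosine term in the loss encouraging the
    # two embeddings to be complementary (dissimilar) rather than redundant.
    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class DualBranchSketch(nn.Module):
        def __init__(self, feat_dim=88, emb_dim=128, n_classes=4):
            super().__init__()
            # Branch 1: embeds hand-crafted acoustic features (dimension assumed).
            self.feat_branch = nn.Sequential(nn.Linear(feat_dim, emb_dim), nn.ReLU())
            # Branch 2: embeds raw waveform via 1-D convolution (details assumed).
            self.wave_branch = nn.Sequential(
                nn.Conv1d(1, 32, kernel_size=80, stride=16), nn.ReLU(),
                nn.AdaptiveAvgPool1d(1), nn.Flatten(),
                nn.Linear(32, emb_dim), nn.ReLU(),
            )
            # Classifier operates on the concatenated (augmented) feature space.
            self.classifier = nn.Linear(2 * emb_dim, n_classes)

        def forward(self, feats, wave):
            e_f = self.feat_branch(feats)   # (B, emb_dim) feature embedding
            e_w = self.wave_branch(wave)    # (B, emb_dim) waveform embedding
            logits = self.classifier(torch.cat([e_f, e_w], dim=1))
            return logits, e_f, e_w

    def complementary_loss(logits, labels, e_f, e_w, alpha=0.1):
        # Cross-entropy for emotion classification plus a cosine-similarity
        # penalty; since cosine distance = 1 - cosine similarity, minimizing
        # the similarity term realizes the negative-cosine-distance constraint
        # described in the abstract (up to an additive constant).
        ce = F.cross_entropy(logits, labels)
        cos = F.cosine_similarity(e_f, e_w, dim=1).mean()
        return ce + alpha * cos

In use, feats would hold knowledge-based acoustic descriptors and wave the raw waveform as a (batch, 1, samples) tensor; the alpha weight trades classification accuracy against the complementarity constraint and is a hypothetical choice here.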