<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Sig. Proc.</journal-id>
<journal-title>Frontiers in Signal Processing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Sig. Proc.</abbrev-journal-title>
<issn pub-type="epub">2673-8198</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">842925</article-id>
<article-id pub-id-type="doi">10.3389/frsip.2022.842925</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Signal Processing</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>AL-Net: Asymmetric Lightweight Network for Medical Image Segmentation</article-title>
<alt-title alt-title-type="left-running-head">Du et al.</alt-title>
<alt-title alt-title-type="right-running-head">AL-Net</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Du</surname>
<given-names>Xiaogang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1509886/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Nie</surname>
<given-names>Yinyin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1596653/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Fuhai</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Lei</surname>
<given-names>Tao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1289865/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wang</surname>
<given-names>Song</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1293071/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Xuejun</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Shaanxi Joint Laboratory of Artificial Intelligence</institution>, <institution>Shaanxi University of Science and Technology</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>The School of Electronic Information and Artificial Intelligence</institution>, <institution>Shaanxi University of Science and Technology</institution>, <addr-line>Xi&#x2019;an</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>The School of Electronic and Information Engineering</institution>, <institution>Lanzhou Jiaotong University</institution>, <addr-line>Lanzhou</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/254056/overview">Jong-Seok Lee</ext-link>, Yonsei University, South Korea</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/661492/overview">Jakub Nalepa</ext-link>, Silesian University of Technology, Poland</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1718613/overview">Yeejin Lee</ext-link>, Seoul National University of Science and Technology, South Korea</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Tao Lei, <email>leitao@sust.edu.cn</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Image Processing, a section of the journal Frontiers in Signal Processing</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>02</day>
<month>05</month>
<year>2022</year>
</pub-date>
<pub-date pub-type="collection">
<year>2022</year>
</pub-date>
<volume>2</volume>
<elocation-id>842925</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>12</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>03</month>
<year>2022</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2022 Du, Nie, Wang, Lei, Wang and Zhang.</copyright-statement>
<copyright-year>2022</copyright-year>
<copyright-holder>Du, Nie, Wang, Lei, Wang and Zhang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Medical image segmentation plays an important role in clinical applications, such as disease diagnosis and treatment planning. On the premise of ensuring segmentation accuracy, segmentation speed is also an important factor to improve diagnosis efficiency. Many medical image segmentation models based on deep learning can improve the segmentation accuracy, but ignore the model complexity and inference speed resulting in the failure of meeting the high real-time requirements of clinical applications. To address this problem, an asymmetric lightweight medical image segmentation network, namely AL-Net for short, is proposed in this paper. Firstly, AL-Net employs the pre-training RepVGG-A1 to extract rich semantic features, and reduces the channel processing to ensure the lower model complexity. Secondly, AL-Net introduces the lightweight atrous spatial pyramid pooling module as the context extractor, and combines the attention mechanism to capture the context information. Thirdly, a novel asymmetric decoder is proposed and introduced into AL-Net, which not only effectively eliminates redundant features, but also makes use of low-level features of images to improve the performance of AL-Net. Finally, the reparameterization technology is utilized in the inference stage, which effectively reduces the parameters of AL-Net and improves the inference speed of AL-Net without reducing the segmentation accuracy. The experimental results on retinal vessel, cell contour, and skin lesions segmentation datasets show that AL-Net is superior to the state-of-the-art models in terms of accuracy, parameters and inference speed.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>convolutional neural network</kwd>
<kwd>medical image segmentation</kwd>
<kwd>lightweight model</kwd>
<kwd>contextual encoder</kwd>
<kwd>asymmetric decoder</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Medical image segmentation refers to the process of dividing medical images into several non-overlapping regions according to some similarity characteristics of medical images. Medical image segmentation is of great significance for understanding the content of medical images and discovering lesion objects. It is not only the basis of biomedical image analysis, such as medical image registration and 3D reconstruction, but also plays an extremely important role in clinical diagnosis and treatment.</p>
<p>In recent years, with the development of deep learning, medical image segmentation based on deep learning has made remarkable progress and become a hot topic in the field of medical image analysis. Many classical semantic segmentation models (<xref ref-type="bibr" rid="B27">Liu et al., 2015</xref>; <xref ref-type="bibr" rid="B31">Ronneberger et al., 2015</xref>; <xref ref-type="bibr" rid="B34">Shelhamer et al., 2015</xref>; <xref ref-type="bibr" rid="B6">Chen et al., 2017</xref>; <xref ref-type="bibr" rid="B5">Chen et al., 2018</xref>; <xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>; <xref ref-type="bibr" rid="B48">Zhou et al., 2019</xref>; <xref ref-type="bibr" rid="B49">Zhou et al., 2020</xref>; <xref ref-type="bibr" rid="B17">Chen et al., 2021</xref>) usually adopt the idea of extracting pixel-level features, such as the end-to-end fully convolutional network (FCN) (<xref ref-type="bibr" rid="B34">Shelhamer et al., 2015</xref>) and U-shape Net (U-Net) (<xref ref-type="bibr" rid="B31">Ronneberger et al., 2015</xref>). The above two types of segmentation models are mainly composed of encoder and decoder. Meanwhile, more and more contextual feature extraction modules are also employed in medical image segmentation. Firstly, Medical image segmentation models usually employ the most popular feature extractors (<xref ref-type="bibr" rid="B34">Shelhamer et al., 2015</xref>); (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>); (<xref ref-type="bibr" rid="B35">Simonyan and Zisserman, 2014</xref>); (<xref ref-type="bibr" rid="B10">He et al., 2016</xref>); (<xref ref-type="bibr" rid="B20">Valanarasu et al., 2021</xref>) as encoders, such as VGG and ResNet, but the improvement of segmentation accuracy usually leads to the increase of model complexity. Secondly, context information is indispensable for image feature extraction. 
At present, most prominent semantic feature extraction modules are implemented based on dilated convolution (<xref ref-type="bibr" rid="B5">Chen et al., 2018</xref>); (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>) and multi-scale pooling (<xref ref-type="bibr" rid="B27">Liu et al., 2015</xref>); (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>); (<xref ref-type="bibr" rid="B18">Jie et al., 2018</xref>). In order to effectively focus on semantic features, attention mechanism is widely used to extract semantic information (<xref ref-type="bibr" rid="B25">Li et al., 2019</xref>); (<xref ref-type="bibr" rid="B30">Ni et al., 2019</xref>); (<xref ref-type="bibr" rid="B21">Le et al., 2020</xref>). Thirdly, medical image segmentation models are mostly improved on the basis of U-Net (<xref ref-type="bibr" rid="B31">Ronneberger et al., 2015</xref>); (<xref ref-type="bibr" rid="B49">Zhou et al., 2020</xref>). U-Net uses skip connection to effectively supplement low-level features, but it leads to information redundancy. In addition, on the basis of ensuring the segmentation accuracy, the segmentation speed is an important factor in applying the medical image segmentation model to clinical treatment. However, these models ignore the inference speed and model complexity to pursue the segmentation accuracy, they are not suitable for some clinical applications, such as image-guided surgery, online adaptive radiotherapy and real-time disease monitoring, which have high real-time requirements for image segmentation task.</p>
<p>To solve the above problems, we propose a lightweight asymmetric medical image segmentation network, namely AL-Net for short. Our main contributions are summarized as follows.</p>
<list list-type="simple">
<list-item>
<p>1) We introduce RepVGG-A1 as the encoder of AL-Net to extract powerful semantic features, and select Lite R-ASPP as the context information extraction module to ensure that the model can effectively capture the context features and has smaller parameters and lower model complexity.</p>
</list-item>
<list-item>
<p>2) We design an asymmetric decoder using skip connection and convolution operation for medical image segmentation. This decoder not only fully integrates the low-level features of images, but also eliminates the feature redundancy to further improve the segmentation accuracy.</p>
</list-item>
<list-item>
<p>3) We integrate the re-parameterization technology in the inference stage of AL-Net. Therefore, the inference model of AL-Net has only 3.45&#xa0;M parameters, and achieves the best balance between speed and accuracy on the dataset of retinal vessel, cell contour and skin image, respectively, which is better than the existing models.</p>
</list-item>
</list>
<p>The structure of the remainder of this paper is organized as follows. <xref ref-type="sec" rid="s2">Section 2</xref> introduces the related work of this paper. <xref ref-type="sec" rid="s3">Section 3</xref> mainly describes the AL-Net in detail. <xref ref-type="sec" rid="s4">Section 4</xref> demonstrates the performance of AL-Net. Finally, <xref ref-type="sec" rid="s5">Section 5</xref> makes the conclusion for this paper.</p>
</sec>
<sec id="s2">
<title>2 Related Work</title>
<p>In recent years, medical image segmentation based on deep learning has made great progress. In this section, we mainly introduce three general components of medical image segmentation network and the popular lightweight architecture.</p>
<sec id="s2-1">
<title>2.1 General Components of Network</title>
<p>Medical image segmentation network usually includes encoder, decoder and context extraction module. In this section, we discuss these modules in detail.</p>
<p>
<italic>Encoder:</italic> The semantic segmentation model based on deep learning (<xref ref-type="bibr" rid="B36">Szegedy et al., 2016a</xref>; <xref ref-type="bibr" rid="B22">Le et al., 2019</xref>; <xref ref-type="bibr" rid="B19">Jns et al., 2020</xref>) uses the encoder to extract high-level semantic information. U-Net selects the most powerful convolutional neural network VGG (<xref ref-type="bibr" rid="B35">Simonyan and Zisserman, 2014</xref>) as the encoder to capture high-level semantic information, but VGG limits the richness of image features due to the simple structure (<xref ref-type="bibr" rid="B31">Ronneberger et al., 2015</xref>). Because more and more powerful convolutional neural networks are proposed, the medical image segmentation network can choose a more advanced convolutional neural network as the backbone to extract more abundant image features (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>); (<xref ref-type="bibr" rid="B49">Zhou et al., 2020</xref>); (<xref ref-type="bibr" rid="B17">Chen et al., 2021</xref>). For example, Context Encoder Network (CE-Net) (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>) selects ResNet-34 (<xref ref-type="bibr" rid="B10">He et al., 2016</xref>) as the encoder, because the parameters of ResNet-34 are moderate and gradient dispersion can be avoided through residual connection. The backbone of U-Net&#x2b;&#x2b; (<xref ref-type="bibr" rid="B49">Zhou et al., 2020</xref>) is ResNet-101 with deeper network layers. TransUNet (<xref ref-type="bibr" rid="B17">Chen et al., 2021</xref>) added transformers to the encoder to extract more advanced features. However, these networks have huge structures and many parameters, which makes the model training and inference process consume a long time and computational resources. 
In order to further reduce the training and inference time of the network within limited computing resources, a series of lightweight backbones have emerged, such as Inception (<xref ref-type="bibr" rid="B36">Szegedy et al., 2016a</xref>); (<xref ref-type="bibr" rid="B37">Szegedy et al., 2015</xref>); (<xref ref-type="bibr" rid="B38">Szegedy et al., 2016b</xref>); (<xref ref-type="bibr" rid="B33">Sergey IoffeSzegedy, 2015</xref>), DenseNet (<xref ref-type="bibr" rid="B14">Huang et al., 2017</xref>) and RefineNet (<xref ref-type="bibr" rid="B29">Nekrasov et al., 2018</xref>); (<xref ref-type="bibr" rid="B26">Lin et al., 2017</xref>). Although the lightweight structure of the network is realized using these lightweight backbones, the segmentation accuracy has made an unexpected sacrifice. Re-parameterization technology can effectively avoid the contradiction between model lightweight and segmentation accuracy. Recently, Ding et al. (<xref ref-type="bibr" rid="B8">Ding et al., 2021</xref>) use re-parameterization technology to realize multi-branch training and single branch inference, which opens up another way for the selection of encoder.</p>
<p>
<italic>Decoder:</italic> The decoder is used to recover the spatial information of images step by step, but the earliest decoder only performs up-sampling, which will lead to the inability to recover the spatial information of images. Then, U-Net (<xref ref-type="bibr" rid="B31">Ronneberger et al., 2015</xref>) proposes a U-shaped decoder, which is composed of up-sampling and skip connection to supplement the detailed information lost in the encoder stage. However, the simple connection is easy to cause the loss of important semantic information in the process of high-level and low-level semantic information fusion. To solve this problem, scholars have proposed a variety of decoders to improve feature fusion (<xref ref-type="bibr" rid="B15">Ibtehaz and Rahman, 2020</xref>); (<xref ref-type="bibr" rid="B2">Alom et al., 2019</xref>); (<xref ref-type="bibr" rid="B47">Zheng et al., 2020</xref>). Nabil et al. (<xref ref-type="bibr" rid="B15">Ibtehaz and Rahman, 2020</xref>) used the residual path to replace the skip connection of the U-shaped decoder in the decoder of multiResUnet, so as to eliminate the semantic difference caused by the fusion of the low-level features of the encoder and the high-level features of the decoder. <xref ref-type="bibr" rid="B49">Zhou et al. (2020)</xref> improved the decoding ways of U-Net and proposed U-Net&#x2b;&#x2b; with nested dense skip connection path with deep monitoring. <xref ref-type="bibr" rid="B2">Alom et al. (2019)</xref> added a dual attention mechanism composed of spatial and channel attention in the last two layers of the decoder. <xref ref-type="bibr" rid="B47">Zheng et al. (2020)</xref> applied transformer to image segmentation and designed three different decoders based on the output serialization characteristics of transformer. However, these works only focus on improving the segmentation accuracy, ignoring the issue that many branch structures lead to a significantly slow inference speed.</p>
<p>
<italic>Context extraction module:</italic> To maintain the semantic information extracted in the encoding stage, many modules for extracting image context information are proposed. ParseNet fuses global context information from image level to solve the problem of insufficient actual receptive field (<xref ref-type="bibr" rid="B27">Liu et al., 2015</xref>). DeepLabv2 proposes the atrous spatial pyramid pooling (ASPP) module to effectively capture contextual features by expanding receptive fields (<xref ref-type="bibr" rid="B5">Chen et al., 2018</xref>). DeepLabv3 combines image level information and employs parallel atrous convolution layers with different dilated rates to capture multi-scale information (<xref ref-type="bibr" rid="B6">Chen et al., 2017</xref>). Nekrasov et al. (<xref ref-type="bibr" rid="B29">Nekrasov et al., 2018</xref>) designed the chained residual pooling (CRP) module and used it to capture context features from high-resolution images and improve the performance of semantic segmentation. To solve the problem of object size change in image segmentation, Gu et al. (<xref ref-type="bibr" rid="B9">Gu et al., 2019</xref>) proposed dense atrous convolution (DAC) module and residual multi-kernel pooling (RMP) module, which rely on the effective receptive fields to detect objects with different sizes. However, most of these modules only retain context information. For medical images with complex background, it is of great significance to focus the objects with sufficient context information. Hu et al. (<xref ref-type="bibr" rid="B18">Jie et al., 2018</xref>) proposed the squeeze and excitation (SE) module, which can automatically improve the useful features according to the importance and suppress the features that contribute less to the current task, so as to enhance the features and improve the segmentation performance.</p>
</sec>
<sec id="s2-2">
<title>2.2 Lightweight Segmentation</title>
<p>In recent years, the lightweight design of semantic segmentation network has gradually become a hot topic of image segmentation task, which has attracted the attention of many scholars. SegNAS3D (<xref ref-type="bibr" rid="B41">Wong and Moradi, 2019</xref>) uses network architecture search to solve the problem of network structure optimization in 3D image segmentation, which greatly reduces the complexity of model. In order to pursue the real-time performance of the model, <xref ref-type="bibr" rid="B29">Nekrasov et al. (2018)</xref> employed the lightweight RefineNet (<xref ref-type="bibr" rid="B26">Lin et al., 2017</xref>) as the backbone network. ICNet (<xref ref-type="bibr" rid="B46">Zhao et al., 2018</xref>) uses image cascade and branch training to accelerate the convergence of model. BiSeNet (<xref ref-type="bibr" rid="B43">Yu et al., 2018</xref>); (<xref ref-type="bibr" rid="B42">Yu et al., 2021</xref>) realized a lightweight model based on double branch structure, which uses different paths to extract spatial and semantic information. In addition, other models use common components to reduce the amount of computation. For example, DMFNet (<xref ref-type="bibr" rid="B4">Chen et al., 2019</xref>) divides the channels into multiple groups, and introduces weighted three-dimensional extended convolution to reduce parameters and improve the inference efficiency of model. Xception (<xref ref-type="bibr" rid="B7">Chollet, 2017</xref>) and MobileNets (<xref ref-type="bibr" rid="B12">Howard et al., 2017</xref>); (<xref ref-type="bibr" rid="B32">Sandler et al., 2018</xref>) employed deep separable convolution to effectively improve the inference speed. Dense-Inception U-Net (<xref ref-type="bibr" rid="B45">Zhang et al., 2020</xref>) combines the lightweight backbone Inception and dense module to extract high-level semantic information with lightweight encoder. 
ShuffleNets (<xref ref-type="bibr" rid="B28">Ma et al., 2018</xref>); (<xref ref-type="bibr" rid="B44">Zhang et al., 2018</xref>) proposed group convolution and channel shuffling, which greatly reduced the computational cost compared with the advanced models. However, lightweight segmentation networks for relatively complex medical images are fewer than those for natural images.</p>
<p>Some scholars have also designed lightweight segmentation networks for medical images. However, on the premise of ensuring accuracy, there are few medical image segmentation networks that achieve both low complexity and high inference speed. nnU-Net (<xref ref-type="bibr" rid="B16">Isensee et al., 2020</xref>) improves the adaptability of the network by preprocessing the data and post-processing the segmentation results, but comes at cost of increasing model parameters. U-Net&#x2b;&#x2b; (<xref ref-type="bibr" rid="B49">Zhou et al., 2020</xref>) uses small parameters to achieve good segmentation accuracy, but ignores the inference time of the model. And lightweight V-Net (<xref ref-type="bibr" rid="B23">Lei et al., 2020</xref>) guarantees segmentation accuracy and fewer parameters by employing depth-wise convolution and point-wise convolution, but does not improve the inference time of the model. In addition, <xref ref-type="bibr" rid="B39">Tarasiewicz et al. (2021)</xref> trained multiple skinny networks over all image planes and proposed Lightweight U-Nets, which obtains accurate brain tumor delineation from multi-modal MRIs. PyConvU-Net (<xref ref-type="bibr" rid="B24">Li et al., 2021</xref>) replaces all conventional convolution layers in the U-Net with the pyramidal convolution, which makes the segmentation accuracy better and the parameters less. However, the inference speed of PyConvU-Net still needs to be improved.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Asymmetric Lightweight Network</title>
<p>To ensure the segmentation accuracy and improve the segmentation speed, we proposed an asymmetric lightweight network for medical image segmentation. <xref ref-type="fig" rid="F1">Figure 1</xref> shows the network structure of the proposed AL-Net. In <xref ref-type="fig" rid="F1">Figure 1</xref>, AL-Net consists of three important components, which are encoder, semantic extraction module and decoder. Compared to other classical models, the encoder of AL-Net does not involve residual connection, which makes AL-Net occupy less memory in the training stage. Meanwhile, AL-Net improves the ability of feature representation and generalization by designing multi-branch parallel structure in each convolution layer. The semantic extraction module is usually a pyramid structure, and the semantic extraction module employed by AL-Net are LR-ASPP (<xref ref-type="bibr" rid="B13">Howard et al., 2019</xref>). Compared with other pyramid modules, it can not only capture different-sized objects by the multi-core effective receptive field, but also combine the attention mechanism to more effectively deal with the low-contrast medical image segmentation task. In addition, LR-ASPP also uses a large pooling kernel with a large stride and only one 1 &#xd7; 1 convolution, which save some computation and make AL-Net more lightweight. In order to effectively map the low-resolution features of the encoding stage to the pixel-level classification features with the original resolution in the decoding stage, we are inspired by the DeepLabV3&#x2b; and U-Net, and design a decoder with asymmetric structure and apply it to AL-Net, which is more suitable for medical image segmentation. This decoder uses 3 &#xd7; 3 convolutions on the basis of U-shaped decoder instead of using skip connections directly, which can not only fully integrate the low-level semantic information, but also effectively eliminate the redundant features of images. 
It is more conducive to accurately extracting object contours of medical images. In addition, in the inference stage of AL-Net, we employ the re-parameterization technology and optimize the multi-branch structure of the encoder to single-branch structure, which improves the inference speed of AL-Net.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The structure of AL-Net. After the training of AL-Net, each RepVGG block in the encoder is processed by reparameterization technology, so as to improve the inference efficiency of AL-Net.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g001.tif"/>
</fig>
<sec id="s3-1">
<title>3.1 Encoder Module</title>
<p>The encoder plays an important role in image segmentation and feature extraction. The early medical image segmentation network, such as U-Net, chose VGG as encoder, which is always composed of convolution, ReLU and pooling. With the development of deep learning technology, the encoders of medical image segmentation network usually choose better modules, such as Inception, ResNet and DenseNet, which make the medical image segmentation model more and more complex. Although complex models may have higher accuracy than simple models, the multi-branch structure of complex models makes the model difficult to implement, and increases the inference time and memory utilization. In order to guarantee the segmentation accuracy and reduce the model complexity, the encoder of AL-Net employs the RepVGG-A1, which has the same ability of feature representation as Res-Net34 but has fewer parameters. RepVGG block is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The RepVGG-A1 is designed for the image classification task of ImageNet, and there are few classes for medical image segmentation task. Therefore, there is channel redundancy when RepVGG-A1 is utilized as the backbone of AL-Net. We decrease the channels of the backbone of AL-Net without significantly reducing performance. Specially, the stride convolution is used by the RepVGG to replace the pooling operation, which avoids the possibility of losing the spatial information of images. In addition, the RepVGG can employ re-parameterization technology to transform multi-branch structure into single-branch structure to improve the inference speed effectively.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>RepVGG block. RepVGG-A1 is divided into five stages, and the number of the layers of each stage are 1, 2, 4, 14 and 1, respectively. Training stage (Left): in the first layer of each stage, down sampling is carried out through convolution with step size of 2, and there is no identity branch. Inference stage (Right): AL-Net becomes a single branch after reparameterization. Only the three layers of one stage are shown here.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g002.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Context Extractor Module</title>
<p>The context extraction module is used to extract contextual semantic information and generate more high-level feature maps. Currently popular context extraction modules, such as ASPP, can enrich spatial information, but do not have a specific direction of feature response. Medical images have the characteristics of high complexity, lack of simple linear features, and the gray of the background and objects are similar. Therefore, compared with natural images, it is more necessary for medical images to optimize the channel dimension. In this paper, the Lite R-ASPP firstly discards the atrous convolution that spends a lot of computational cost. Then, Lite R-ASPP can realize the information integration between channels by simplifying the four branches into two branches. Lite R-ASPP employs the global average pooling to prevent over fitting by regularizing the structure of the whole network. The global context information and semantic features are extracted by global average pooling to realize the global distribution on the feature channel. The feature is compressed into an attention vector. Finally, hard sigmoid with faster computational speed is employed as the activation function to realize the weight normalization, so as to recalibrate the semantic dependencies of the original features in the channel dimension, and highlight the key features and filter the background information. In addition, Lite R-ASPP uses only 1 &#xd7; 1 convolution, which effectively reduces the parameters. Lite R-ASPP is shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The structure of Lite R-ASPP. H-Sigmoid stands for the Hard Sigmoid function.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g003.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Decoder Module</title>
<p>In the popular architecture of image segmentation networks, the decoder only has a simple upsampling process, which may lead to the loss of spatial information. To address this issue, U-Net uses skip connection to fuse the feature maps in the encoding and decoding stage to obtain richer spatial information. However, this connection mode produces many redundant low-level features. To solve this problem, we design an asymmetric decoder, namely A-Decoder. Firstly, A-Decoder still fully integrates the low-level features in the encoding stage to supplement the high-resolution information and recover more edge information of objects in medical images. However, instead of using skip connection for symmetrical structure directly, 3 &#xd7; 3 convolution is used after fusing low-level features, which effectively reduces redundant features. Then, it is fused with the high-level features from the context extraction module to refine the contour information of objects. Finally, A-Decoder apply a 1 &#xd7; 1 convolution to reduce the number of channels. A-Decoder is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>. In addition, A-Decoder can effectively reduce the fusion times of skip connections and supplement the same amount of spatial information with less computation. A-Decoder can be expressed as:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mtext>out</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mtext>low</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mtext>high</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2217;C</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mtext>low</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>4</mml:mn>
</mml:msub>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2217;C</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2217;C</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2217;C</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mrow>
<mml:mtext>high</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>Up</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>Up</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>5</mml:mn>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mtext>&#x2217;C</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>Where <inline-formula id="inf1">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf2">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf3">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>3</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf4">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>4</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> stand for the output of encoders in the different stages, respectively. <inline-formula id="inf5">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mtext>F</mml:mtext>
<mml:mn>5</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the output of semantic extraction module. <inline-formula id="inf6">
<mml:math id="m9">
<mml:mrow>
<mml:msup>
<mml:mtext>C</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>3</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf7">
<mml:math id="m10">
<mml:mrow>
<mml:msup>
<mml:mtext>C</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> stand for <inline-formula id="inf8">
<mml:math id="m11">
<mml:mrow>
<mml:mn>3</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf9">
<mml:math id="m12">
<mml:mrow>
<mml:mtext>&#xa0;</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> convolution, respectively. <inline-formula id="inf10">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>Up</mml:mtext>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf11">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>Up</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mn>16</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent 2x up-sampling and 16x up-sampling, respectively.</p>
</sec>
<sec id="s3-4">
<title>3.4 Loss Function</title>
<p>We utilize a loss function to supervise the training process of AL-Net. Binary cross entropy is defined as a measure of the difference between two probability distributions for a given random variable or set of events. It is widely used in classification and segmentation tasks, and segmentation is a kind of pixel-level classification. Therefore, binary cross entropy loss function works well in the segmentation tasks. In addition, Dice coefficient is a set similarity measure, which is usually used to calculate the similarity of two samples. Dice coefficient can maximize the segmentation objects, thus preventing the learning process from falling into the local minimum. To effectively segment objects in medical images, AL-Net employs a composite loss function that combines Dice coefficient and binary cross entropy. The loss function employed by AL-Net is shown in <xref ref-type="disp-formula" rid="e4">Equation (4)</xref>:<disp-formula id="e4">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>loss</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>Dice</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>BCE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>
<inline-formula id="inf12">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>BCE</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is defined in <xref ref-type="disp-formula" rid="e5">Eq. 5</xref>:<disp-formula id="e5">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>BCE</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mtext>n</mml:mtext>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mtext>i</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mtext>n</mml:mtext>
</mml:munderover>
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:mstyle>
<mml:mi>log</mml:mi>
<mml:mtext>&#x3c3;</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>Y</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mi>log</mml:mi>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>&#x3c3;</mml:mtext>
<mml:mrow>
<mml:mo>(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mtext>Y</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
<mml:mtext>&#xa0;&#xa0;</mml:mtext>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>Where, <inline-formula id="inf13">
<mml:math id="m18">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m19">
<mml:mtext>Y</mml:mtext>
</mml:math>
</inline-formula> represent ground truth and prediction results, respectively; <inline-formula id="inf15">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mtext>X</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf16">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mtext>Y</mml:mtext>
<mml:mtext>i</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> stand for the ith element of <inline-formula id="inf17">
<mml:math id="m22">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula id="inf18">
<mml:math id="m23">
<mml:mtext>Y</mml:mtext>
</mml:math>
</inline-formula>, respectively. <inline-formula id="inf19">
<mml:math id="m24">
<mml:mtext>&#x3c3;</mml:mtext>
</mml:math>
</inline-formula> stands for the Sigmoid function. <inline-formula id="inf20">
<mml:math id="m25">
<mml:mtext>n</mml:mtext>
</mml:math>
</inline-formula> represents the total number of elements of <inline-formula id="inf21">
<mml:math id="m26">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula>. The Dice Loss is defined as:<disp-formula id="e6">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mtext>L</mml:mtext>
<mml:mrow>
<mml:mtext>Dice</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x2229;</mml:mo>
<mml:mtext>Y</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
<mml:mo>&#x2b;</mml:mo>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>Y</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>Where, <inline-formula id="inf22">
<mml:math id="m28">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x2229;</mml:mo>
<mml:mtext>Y</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the element number of the intersection of <inline-formula id="inf23">
<mml:math id="m29">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula id="inf24">
<mml:math id="m30">
<mml:mtext>Y</mml:mtext>
</mml:math>
</inline-formula>. <inline-formula id="inf25">
<mml:math id="m31">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>X</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf26">
<mml:math id="m32">
<mml:mrow>
<mml:mo>&#x7c;</mml:mo>
<mml:mtext>Y</mml:mtext>
<mml:mo>&#x7c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> represent the element numbers of <inline-formula id="inf27">
<mml:math id="m33">
<mml:mtext>X</mml:mtext>
</mml:math>
</inline-formula> and <inline-formula id="inf28">
<mml:math id="m34">
<mml:mtext>Y</mml:mtext>
</mml:math>
</inline-formula>, respectively.</p>
</sec>
<sec id="s3-5">
<title>3.5 Re-Parameterization</title>
<p>AL-Net is a convolutional neural network with multi-branch structure. This structure is beneficial to model training and improve the segmentation accuracy, but it will lead to a long inference time. In the inference stage, reparameterization technology can couple multiple branches into a single branch, which can speed up the inference procedure of AL-Net without sacrificing accuracy. Generally, the encoder has the greatest impact on the performance of image segmentation model. The encoder of AL-Net consists of three branches: identity branch, 1 &#xd7; 1 convolution and 3 &#xd7; 3 convolution. In the procedure of reparameterization, the identity branch can be regarded as a degenerate 1 &#xd7; 1 convolution and 1 &#xd7; 1 convolution can be regarded as a degenerate 3 &#xd7; 3 convolution. Thus, 3 &#xd7; 3 convolution, 1 &#xd7; 1 convolution and the batch normalization layer in the training model can be reconstructed into a 3 &#xd7; 3 convolution in the inference model. After reparameterization, the encoder of AL-Net becomes a single branch structure with only 3 &#xd7; 3 convolution layer. The procedure of reparameterization for each block is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The reparameterized AL-Net can effectively reduce the number of parameters and shorten the segmentation time.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The procedure of reparameterization.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g004.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments</title>
<p>In this section, we first introduce the datasets, experimental setup and evaluation criteria. Then, the ablation studies for AL-Net are carried out on the cell contour segmentation datasets. Finally, AL-Net is compared with other state-of-the-art segmentation models in terms of parameters, segmentation speed and accuracy, and the results on three datasets of retinal vessels, cell contour and skin lesions are shown and discussed.</p>
<sec id="s4-1">
<title>4.1 Datasets</title>
<p>In order to evaluate the performance of AL-Net, we conducted segmentation experiments on three medical image datasets, which are shown in <xref ref-type="table" rid="T1">Table 1</xref>. These datasets are derived from the most common medical imaging modalities, including microscopy, dermatoscope and optical coherence tomography.<list list-type="simple">
<list-item>
<p>1) Retinal vessels. This dataset is a color fundus image dataset for retinal vessel segmentation (<xref ref-type="bibr" rid="B11">Hoover and Goldbaum, 2003</xref>), which includes ten lesion images and ten healthy images. The size of each image is 605 &#xd7; 700.</p>
</list-item>
<list-item>
<p>2) Cell contour. This dataset is obtained by transmission electron microscopy from the serial segment of the ventral nerve zone of <italic>Drosophila melanogaster</italic>, with a total of 30 images. Each image has complete cell and membrane labels, and the size of each image is 512 &#xd7; 512 (<xref ref-type="bibr" rid="B3">Cardona et al., 2010</xref>).</p>
</list-item>
<list-item>
<p>3) Skin lesions. This dataset is provided by ISIC 2018 (<xref ref-type="bibr" rid="B40">Tschandl et al., 2018</xref>); (<xref ref-type="bibr" rid="B1">Allan, 2019</xref>) for melanoma detection, including 2,594 images of skin lesion. The size of each image is 2,166 &#xd7; 3,188, which is resampled to 512 &#xd7; 512 in this experiment.</p>
</list-item>
</list>
</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Medical image segmentation datasets for the experiments.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Segmentation objects</th>
<th align="center">Images</th>
<th align="center">Input size</th>
<th align="center">Modality</th>
<th align="center">Provider</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Retinal vessels</td>
<td align="char" char=".">20</td>
<td align="char" char="&#xd7;">605 &#xd7; 700</td>
<td align="left">OCT</td>
<td align="left">STARE</td>
</tr>
<tr>
<td align="left">Cell contour</td>
<td align="char" char=".">30</td>
<td align="char" char="&#xd7;">512 &#xd7; 512</td>
<td align="left">Microscopy</td>
<td align="left">ISBI 2012</td>
</tr>
<tr>
<td align="left">Skin lesions</td>
<td align="char" char=".">2,594</td>
<td align="char" char="&#xd7;">512 &#xd7; 512</td>
<td align="left">Dermatoscope</td>
<td align="left">ISIC 2018</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Two steps of data augmentation are carried out for these datasets to avoid the risk of overfitting caused by limited data. Firstly, each image is expanded to eight times of the original image by flipping horizontally, vertically and diagonally, respectively. Then, each image is translated up, down, left and right, respectively. The above data augmentation methods do not change the data distribution; meanwhile, they can avoid overfitting in the training process and effectively improve the generalization ability of the model.</p>
<p>The division of training dataset and test dataset of cell contour and skin lesions segmentation dataset is consistent with the official description. Retinal vessel segmentation dataset is randomly divided into the training dataset and test dataset according to 7:3. The training dataset and test dataset are separately expanded using the data augmentation methods described above. Then, the validation dataset is divided from the training dataset, and the proportion of training dataset, validation dataset and test dataset is 5:2:3. In the experiment, the training dataset is used for model training, the test dataset is used to evaluate the model, and the validation dataset is used to evaluate the model performance in the process of model training to obtain the best model.</p>
</sec>
<sec id="s4-2">
<title>4.2 Experimental Setup</title>
<p>All models involved in the experiment are implemented on the cloud service platform, which is equipped with NVIDIA Tesla P100 GPU with 16&#xa0;GB memory. PyTorch is chosen as the framework of deep learning. In AL-Net, Adam is used as the optimizer. The initial learning rate is set to 5e-3 and the batch size is set to 4. In addition, we use the automatic attenuation strategy of the learning rate, where the step size is 1 and the attenuation factor <inline-formula id="inf29">
<mml:math id="m35">
<mml:mtext>&#x3b3;</mml:mtext>
</mml:math>
</inline-formula> is 0.95. The maximum epoch is set to 150 in all experiments, and the training processes of all models are terminated when epoch is 150.</p>
</sec>
<sec id="s4-3">
<title>4.3 Evaluation Measures</title>
<p>We evaluate the performance of AL-Net from three aspects: model complexity, inference speed and segmentation accuracy. The model complexity is measured by the parameters of the model, and the inference speed is evaluated by the inference time of the model for a single image. For the sake of fairness, the inference time is the average time of performing ten segmentation processes for each sample after hardware warm-up. We choose the accuracy (Acc) and intersection over union (IoU) to evaluate the segmentation accuracy of all models.</p>
<p>Accuracy refers to the ratio of object results in all prediction results, which is shown in <xref ref-type="disp-formula" rid="e7">Eq. 7</xref>:<disp-formula id="e7">
<mml:math id="m36">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>IoU represents the similarity or overlap between the predicted object and the ground truth, which is computed as follows:<disp-formula id="e8">
<mml:math id="m37">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>Where <inline-formula id="inf30">
<mml:math id="m38">
<mml:mrow>
<mml:mtext>TP</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf31">
<mml:math id="m39">
<mml:mrow>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf32">
<mml:math id="m40">
<mml:mrow>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf33">
<mml:math id="m41">
<mml:mrow>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> represent the number of true positive, true negative, false positive and false negative, respectively. The value range of <inline-formula id="inf34">
<mml:math id="m42">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf35">
<mml:math id="m43">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> is [0, 1]. The closer the values of <inline-formula id="inf36">
<mml:math id="m44">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf37">
<mml:math id="m45">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> are to 1, the better the segmentation result.</p>
</sec>
<sec id="s4-4">
<title>4.4 Ablation Study</title>
<p>In this paper, we design a lightweight segmentation network for medical images on the premise of ensuring the segmentation accuracy, which mainly includes three contributions. Firstly, RepVGG-A1, which realizes lightweight design using the re-parameterization technology, is selected as the backbone of AL-Net. Secondly, we integrate LR-ASPP which is a lightweight context information extraction module into AL-Net. Thirdly, we design a decoder with asymmetric structure in terms of the characteristics of medical images. To validate the effectiveness of the three contributions, we conduct the ablation study on the test dataset of cell contour images.</p>
<sec id="s4-4-1">
<title>4.4.1 Ablation Study for RepVGG-A1</title>
<p>Encoder plays an important role in extracting features for the image segmentation model. High performance encoder is of great significance to image segmentation model. Since we choose RepVGG-A1 as the encoder of AL-Net, we compare it with Res-Net34 to validate the performance of RepVGG-A1. RepVGG-A1 and Res-Net34 are respectively used as encoders of AL-Net. The results of segmentation accuracy and speed are shown in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>The results of ablation study for RepVGG-A1.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Encoder</th>
<th align="center">
<inline-formula id="inf38">
<mml:math id="m46">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">Params (M)</th>
<th align="center">Time (ms)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">RepVGG-A1</td>
<td align="char" char="plusmn">
<bold>0.8963 &#xb1; 0.0142</bold>
</td>
<td align="char" char=".">
<bold>3.45</bold>
</td>
<td align="char" char=".">
<bold>34.3</bold>
</td>
</tr>
<tr>
<td align="left">Res-Net34</td>
<td align="char" char="plusmn">0.8960 &#xb1; 0.0147</td>
<td align="char" char=".">21.51</td>
<td align="char" char=".">40.8</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="table" rid="T2">Table 2</xref>, the IoU value of these models is similar, but the parameters of AL-Net using RepVGG-A1 in the inference stage is 3.45&#xa0;M, which is only 1/6 of AL-Net using Res-Net34. Moreover, the inference time of AL-Net using RepVGG-A1 is 34.3&#xa0;ms, which is faster than that of AL-Net using Res-Net34. To sum up, compared with Res-Net34, our encoder has higher accuracy, faster speed and smaller parameters.</p>
</sec>
<sec id="s4-4-2">
<title>4.4.2 Ablation Study for LR-ASPP Block</title>
<p>The context extraction module is an important component to enhance the ability of the feature representation of the model. We employed LR-ASPP as the semantic feature extractor of AL-Net and compared it with the classical context extraction module, such as ASPP. The results are shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>The results of ablation study for LR-ASPP.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Context extractor</th>
<th align="center">
<inline-formula id="inf39">
<mml:math id="m47">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">Parameters (M)</th>
<th align="center">Time (ms)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">LR-ASPP</td>
<td align="char" char="plusmn">
<bold>0.8963 &#xb1; 0.0142</bold>
</td>
<td align="char" char=".">
<bold>3.45</bold>
</td>
<td align="char" char=".">
<bold>34.3</bold>
</td>
</tr>
<tr>
<td align="left">ASPP</td>
<td align="char" char="plusmn">0.8941 &#xb1; 0.0160</td>
<td align="char" char=".">5.39</td>
<td align="char" char=".">42.4</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="table" rid="T3">Table 3</xref>, the parameters of AL-Net using LR-ASPP is only 3.45&#xa0;M, which is nearly 1/3 less than that of AL-Net using ASPP. Compared to AL-Net using ASPP, the inference time for each image is also shortened from 42.4 to 34.3&#xa0;ms, and the segmentation accuracy is effectively improved. The main reason is that LR-ASPP has fewer branches and combines the attention mechanism, which is more suitable for medical images than ASPP.</p>
</sec>
<sec id="s4-4-3">
<title>4.4.3 Ablation Study for A-Decoder</title>
<p>In order to improve the performance of AL-Net, we also designed an asymmetric decoder A-Decoder, which is compared with the popular symmetric U-Decoder. The results are shown in <xref ref-type="table" rid="T4">Table 4</xref>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>The results of ablation study for A-Decoder.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Decoder</th>
<th align="center">
<inline-formula id="inf40">
<mml:math id="m48">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">Parameters (M)</th>
<th align="center">Time (ms)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">A-Decoder</td>
<td align="char" char="plusmn">
<bold>0.8963 &#xb1; 0.0142</bold>
</td>
<td align="char" char=".">
<bold>3.45</bold>
</td>
<td align="char" char=".">
<bold>34.3</bold>
</td>
</tr>
<tr>
<td align="left">U-Decoder</td>
<td align="char" char="plusmn">0.8805 &#xb1; 0.0169</td>
<td align="char" char=".">3.45</td>
<td align="char" char=".">41.1</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>It can be seen from <xref ref-type="table" rid="T4">Table 4</xref> that the inference speed of the AL-Net with A-Decoder is 34.3&#xa0;ms, which is 16.5% faster than that of the model with U-Decoder, and the IoU is increased from 0.8805 to 0.8963. This is due to the fact that A-Decoder not only retains sufficient low-level semantic information with less feature fusion, but also skillfully employs 3 &#xd7; 3 convolution and refines low-level features to improve performance of the model.</p>
</sec>
</sec>
<sec id="s4-5">
<title>4.5 Comparison With Other Methods</title>
<p>In this section, we compare AL-Net with four semantic segmentation models with excellent performance in recent years (DeepLabv3&#x2b;, CE-Net, PyConvU-Net and U-Net&#x2b;&#x2b;) and its baseline (replacing the encoder of U-Net with RepVGG-A1) in terms of speed and accuracy. We analyze the parameters and speed of these models, and report the experimental results in terms of their accuracy.</p>
<sec id="s4-5-1">
<title>4.5.1 Inference Speed</title>
<p>Inference speed is an important basis for evaluating the performance of medical image segmentation model, especially in practical application. We use the above six models to test the inference speed on the datasets of retinal vessel, cell contour and skin lesions segmentation, respectively. For the sake of fairness, all experiments are conducted in the same environment, and we record the average inference time of ten executions. The experiment results are shown in <xref ref-type="table" rid="T5">Table 5</xref>.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>The evaluation of parameters and inference time of six models.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Models</th>
<th rowspan="2" align="center">Parameters (M)</th>
<th colspan="3" align="center">Inference time (ms)</th>
</tr>
<tr>
<th align="center">Cell contour</th>
<th align="center">Skin lesions</th>
<th align="center">Retinal vessels</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">DeepLabv3&#x2b;</td>
<td align="char" char=".">59.34</td>
<td align="char" char=".">50.6</td>
<td align="char" char=".">51.2</td>
<td align="char" char=".">53.1</td>
</tr>
<tr>
<td align="left">CE-Net</td>
<td align="char" char=".">28.99</td>
<td align="char" char=".">41.5</td>
<td align="char" char=".">42.1</td>
<td align="char" char=".">42.4</td>
</tr>
<tr>
<td align="left">U-Net&#x2b;&#x2b;</td>
<td align="char" char=".">9.16</td>
<td align="char" char=".">1,530.8</td>
<td align="char" char=".">1,542.3</td>
<td align="char" char=".">1910.4</td>
</tr>
<tr>
<td align="left">PyConvU-Net</td>
<td align="char" char=".">3.7</td>
<td align="char" char=".">45.7</td>
<td align="char" char=".">47.2</td>
<td align="char" char=".">50.4</td>
</tr>
<tr>
<td align="left">Baseline</td>
<td align="char" char=".">3.45</td>
<td align="char" char=".">38.7</td>
<td align="char" char=".">38.5</td>
<td align="char" char=".">39.5</td>
</tr>
<tr>
<td align="left">AL-Net</td>
<td align="char" char=".">
<bold>3.45</bold>
</td>
<td align="char" char=".">
<bold>34.3</bold>
</td>
<td align="char" char=".">
<bold>34.6</bold>
</td>
<td align="char" char=".">
<bold>36.2</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="table" rid="T5">Table 5</xref>, the parameters of AL-Net are only 3.45&#xa0;M, which is equivalent to baseline and much smaller than other models. The parameters of AL-Net are only 1/25 of that of DeepLabv3&#x2b;, and 5.71&#xa0;M less than that of U-Net&#x2b;&#x2b;. In addition, the inference time of AL-Net for input images with different sizes is significantly shorter than that of other models. For the input image with size 512 &#xd7; 512 on the cell contour dataset, the inference time of AL-Net is 34.3&#xa0;ms, which is about 44.6 times faster than that of U-Net&#x2b;&#x2b;. For the skin lesions dataset, the inference time of AL-Net is 34.6&#xa0;ms, which is about 1.5 times faster than that of DeepLabv3&#x2b;. For each image with size 700 &#xd7; 605 on the retinal vessel dataset, the inference time of AL-Net is 36.2&#xa0;ms, which is shorter than other models. Because U-Net&#x2b;&#x2b; has only 9.16&#xa0;M parameters but many branches, which reduces the parallelism of the model, the inference time of U-Net&#x2b;&#x2b; is much longer than other models. However, AL-Net achieves the high inference speed due to the single branch structure in the inference stage. AL-Net is also faster than DeepLabv3&#x2b; and CE-Net. There are two main reasons. One is that the decoder of AL-Net has a simple single branch structure in the inference stage, and the other is that the encoder of AL-Net has fewer fusion operations, which saves a lot of time.</p>
</sec>
<sec id="s4-5-2">
<title>4.5.2 Accuracy Analysis</title>
<p>In this section, we will show some visualization examples and experimental results of segmentation accuracy on the retinal vessels, cell contour and skin lesion segmentation datasets.</p>
<p>
<italic>Retinal vessels:</italic> the segmentation results on the retinal vessel dataset are shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. In <xref ref-type="fig" rid="F5">Figure 5</xref>, the segmentation results of AL-Net are closest to the ground truth, and there are inaccurate segmentation boundaries in the segmentation results of other models. For example, the segmentation results of DeepLabv3&#x2b; are the coarsest and cannot interpret the details of retinal vessels. U-Net&#x2b;&#x2b; cannot completely segment the ends of blood vessels. Baseline and CE-Net lead to over segmentation and incorrectly segment objects from background. The accuracy evaluation results of the above six models on the retinal vessels dataset are shown in <xref ref-type="table" rid="T6">Table 6</xref>. The Acc and IoU of AL-Net is 0.9728 and 0.6851, respectively, which is better than other models. In conclusion, the performance of AL-Net for retinal vessels segmentation is significantly better than other models.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>The segmentation results of five models for retinal vessels images.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g005.tif"/>
</fig>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>The accuracy evaluation of six models on the retinal vessels dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">
<inline-formula id="inf41">
<mml:math id="m49">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">
<inline-formula id="inf42">
<mml:math id="m50">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">DeepLabv3&#x2b;</td>
<td align="char" char="plusmn">0.9710 &#xb1; 0.0060</td>
<td align="char" char="plusmn">0.6704 &#xb1; 0.0474</td>
</tr>
<tr>
<td align="left">CE-Net</td>
<td align="char" char="plusmn">0.9649 &#xb1; 0.0056</td>
<td align="char" char="plusmn">0.6374 &#xb1; 0.0404</td>
</tr>
<tr>
<td align="left">U-Net&#x2b;&#x2b;</td>
<td align="char" char="plusmn">0.9716 &#xb1; 0.0099</td>
<td align="char" char="plusmn">0.6614 &#xb1; 0.0841</td>
</tr>
<tr>
<td align="left">PyConvU-Net</td>
<td align="char" char="plusmn">0.9130 &#xb1; 0.0085</td>
<td align="char" char="plusmn">0.6031 &#xb1; 0.0752</td>
</tr>
<tr>
<td align="left">Baseline</td>
<td align="char" char="plusmn">0.9673 &#xb1; 0.0067</td>
<td align="char" char="plusmn">0.6525 &#xb1; 0.0412</td>
</tr>
<tr>
<td align="left">AL-Net</td>
<td align="char" char="plusmn">
<bold>0.9728 &#xb1; 0.0065</bold>
</td>
<td align="char" char="plusmn">
<bold>0.6851 &#xb1; 0.0577</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In the retinal vessels dataset, AL-Net first benefits from the semantic extraction module, which combines the channel attention module, so that AL-Net can not only capture the high-level context information, but also optimize the channel dimension. Secondly, compared with other models, the advantage of AL-Net comes from the decoder. DeepLabv3&#x2b; combines only one layer of low-level information, which is far from enough for medical images. Baseline, U-Net&#x2b;&#x2b;, PyConvU-Net and CE-Net simply transmit low-level features to the decoder through skip connection. However, the decoder of AL-Net integrates low-level features and applies 3 &#xd7; 3 convolution to refine the features, which makes AL-Net more suitable for segmenting small objects and provides a gain effect for segmenting retinal vessels.</p>
<p>
<italic>Cell contour:</italic> <xref ref-type="fig" rid="F6">Figure 6</xref> shows the segmentation results of six models on the cell contour segmentation dataset. In <xref ref-type="fig" rid="F6">Figure 6</xref>, the segmentation results of AL-Net are more consistent with the ground truth, and the segmentation results of other models are discontinuous at the foreground edge. In addition, the segmentation results of U-Net&#x2b;&#x2b; are also disturbed by complex noise. The accuracy evaluation of these models is shown in <xref ref-type="table" rid="T7">Table 7</xref>. In <xref ref-type="table" rid="T7">Table 7</xref>, the value of <inline-formula id="inf43">
<mml:math id="m51">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf44">
<mml:math id="m52">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> of AL-Net are 0.9406 and 0.8963, respectively, which are better than those of other models. Meanwhile, the standard deviation of AL-Net is also smaller than that of other models. To sum up, AL-Net can effectively improve the accuracy of segmentation results, and is suitable for extracting cell contours.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>The segmentation results of five models for cell contour images. The red and yellow boxes respectively represent the segmentation differences of five models in the same position.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g006.tif"/>
</fig>
<table-wrap id="T7" position="float">
<label>TABLE 7</label>
<caption>
<p>The performance evaluation of six models for cell contour segmentation.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">
<inline-formula id="inf45">
<mml:math id="m53">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">
<inline-formula id="inf46">
<mml:math id="m54">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">DeepLabv3&#x2b;</td>
<td align="char" char="plusmn">0.9315 &#xb1; 0.0165</td>
<td align="char" char="plusmn">0.8793 &#xb1; 0.0137</td>
</tr>
<tr>
<td align="left">CE-Net</td>
<td align="char" char="plusmn">0.9144 &#xb1; 0.0176</td>
<td align="char" char="plusmn">0.8587 &#xb1; 0.0188</td>
</tr>
<tr>
<td align="left">U-Net&#x2b;&#x2b;</td>
<td align="char" char="plusmn">0.9219 &#xb1; 0.0186</td>
<td align="char" char="plusmn">0.8661 &#xb1; 0.0156</td>
</tr>
<tr>
<td align="left">PyConvU-Net</td>
<td align="char" char="plusmn">0.9124 &#xb1; 0.0146</td>
<td align="char" char="plusmn">0.8563 &#xb1; 0.0162</td>
</tr>
<tr>
<td align="left">Baseline</td>
<td align="char" char="plusmn">0.9350 &#xb1; 0.0161</td>
<td align="char" char="plusmn">0.8789 &#xb1; 0.0151</td>
</tr>
<tr>
<td align="left">AL-Net</td>
<td align="char" char="plusmn">
<bold>0.9406 &#xb1; 0.0148</bold>
</td>
<td align="char" char="plusmn">
<bold>0.8963 &#xb1; 0.0142</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>
<italic>Skin lesions:</italic> The visualization and accuracy evaluation of segmentation results on the skin lesions dataset using six models are shown in <xref ref-type="fig" rid="F7">Figure 7</xref> and <xref ref-type="table" rid="T8">Table 8</xref>, respectively. In <xref ref-type="fig" rid="F7">Figure 7</xref>, compared with other models, the segmentation results of AL-Net are obviously closer to the ground truth. In <xref ref-type="table" rid="T8">Table 8</xref>, the <inline-formula id="inf47">
<mml:math id="m55">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf48">
<mml:math id="m56">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> of AL-Net are 0.9312 and 0.7947, respectively, which are significantly improved compared with those of other models. Therefore, AL-Net outperforms other state-of-the-art models for skin lesions image segmentation.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>The segmentation results using six models for skin lesions images.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g007.tif"/>
</fig>
<table-wrap id="T8" position="float">
<label>TABLE 8</label>
<caption>
<p>Performance evaluation of six models on the skin lesions dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Models</th>
<th align="center">
<inline-formula id="inf49">
<mml:math id="m57">
<mml:mrow>
<mml:mtext>Acc</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
<th align="center">
<inline-formula id="inf50">
<mml:math id="m58">
<mml:mrow>
<mml:mtext>IoU</mml:mtext>
</mml:mrow>
</mml:math>
</inline-formula> (mean &#xb1; std)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">DeepLabv3&#x2b;</td>
<td align="char" char="plusmn">0.9155 &#xb1; 0.1041</td>
<td align="char" char="plusmn">0.7677 &#xb1; 0.1520</td>
</tr>
<tr>
<td align="left">CE-Net</td>
<td align="char" char="plusmn">0.9305 &#xb1; 0.0711</td>
<td align="char" char="plusmn">0.7760 &#xb1; 0.1371</td>
</tr>
<tr>
<td align="left">U-Net&#x2b;&#x2b;</td>
<td align="char" char="plusmn">0.9058 &#xb1; 0.1073</td>
<td align="char" char="plusmn">0.7255 &#xb1; 0.2011</td>
</tr>
<tr>
<td align="left">PyConvU-Net</td>
<td align="char" char="plusmn">0.8847 &#xb1; 0.1650</td>
<td align="char" char="plusmn">0.6941 &#xb1; 0.1573</td>
</tr>
<tr>
<td align="left">Baseline</td>
<td align="char" char="plusmn">0.9310 &#xb1; 0.0946</td>
<td align="char" char="plusmn">0.7935 &#xb1; 0.1657</td>
</tr>
<tr>
<td align="left">AL-Net</td>
<td align="char" char="plusmn">
<bold>0.9312 &#xb1; 0.0890</bold>
</td>
<td align="char" char="plusmn">
<bold>0.7947 &#xb1; 0.1533</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Bold represents the best result.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>
<xref ref-type="fig" rid="F8">Figure 8</xref> shows the evaluation of the inference speed and accuracy of the six models for three different datasets. As can be seen from <xref ref-type="fig" rid="F8">Figure 8</xref>, AL-Net has a faster inference speed and higher performance than the other models for the three datasets, which more intuitively proves the efficiency and effectiveness of AL-Net.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Speed-accuracy trade-off comparison of six models on three test datasets.</p>
</caption>
<graphic xlink:href="frsip-02-842925-g008.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec id="s5">
<title>5 Conclusion</title>
<p>Aiming at the problems of large parameters and slow inference speed of medical image segmentation model, an asymmetric lightweight semantic segmentation network AL-Net is proposed in this paper. The encoder of AL-Net is trained through multi-branch structure to extract powerful medical image features. The context extraction module of AL-Net captures the context features and recalibrates the feature response in the channel direction by explicitly modeling the interdependence between channels, which is more suitable for segmenting medical images. The decoder of AL-Net not only makes full use of the low-level semantic information, but also combines 3 &#xd7; 3 convolution to effectively eliminate redundant features. Finally, the reparameterization technology simplifies the inference procedure of AL-Net and improves the inference speed of AL-Net. The total parameters of AL-Net are only 3.45&#xa0;M. Meanwhile, compared with the state-of-the-art models, AL-Net has achieved the best accuracy and the fastest speed on three datasets of retinal vessel, cell contour and skin lesions.</p>
</sec>
</body>
<back>
<sec id="s6">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: STARE: <ext-link ext-link-type="uri" xlink:href="http://cecas.clemson.edu/%7Eahoover/stare/">http://cecas.clemson.edu/&#x223c;ahoover/stare/</ext-link>. ISBI2012: <ext-link ext-link-type="uri" xlink:href="http://brainiac2.mit.edu/isbi_challenge/">http://brainiac2.mit.edu/isbi_challenge/</ext-link>. ISIC2018: <ext-link ext-link-type="uri" xlink:href="https://challenge.isic-archive.com/data/">https://challenge.isic-archive.com/data/</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author Contributions</title>
<p>YN and XD put forward the innovative ideas of the article, FW and TL designed and completed some experiments, YN and XD wrote the article, SW and XZ made important revisions to the article.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>This work is partly supported by the National Natural Science Foundation of China under Grant Nos. 61861024, 61871259, and 61762058, the Natural Science Foundation of Gansu Province of China under Grant No. 20JR5RA404, the Natural Science Basic Research Program of Shaanxi (Program No. 2021JC-47), the Key Research and Development Program of Shaanxi (Program No. 2021ZDLGY08-07), and the Shaanxi Joint Laboratory of Artificial Intelligence (Program No. 2020SS-03).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of Interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Allan</surname>
<given-names>C.,</given-names>
</name>
</person-group> <article-title>Halpern et al. &#x201c;Skin lesion Analysis Toward Melanoma Detection 2018: A Challenge Hosted by the International Skin Imaging Collaboration (ISIC)</article-title>,&#x201d; <comment>arXiv:1902.03368</comment>, <year>2019</year>. </citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alom</surname>
<given-names>M. Z.</given-names>
</name>
<name>
<surname>Yakopcic</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hasan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Taha</surname>
<given-names>T. M.</given-names>
</name>
<name>
<surname>Asari</surname>
<given-names>V. K.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Recurrent Residual U-Net for Medical Image Segmentation</article-title>. <source>J. Med. Imaging (Bellingham)</source> <volume>6</volume> (<issue>1</issue>), <fpage>014006</fpage>. <pub-id pub-id-type="doi">10.1117/1.JMI.6.1.014006</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cardona</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Saalfeld</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Preibisch</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Schmid</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Pulokas</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>An Integrated Micro- and Macroarchitectural Analysis of the Drosophila Brain by Computer-Assisted Serial Section Electron Microscopy</article-title>. <source>Plos Biol.</source> <volume>8</volume> (<issue>10</issue>). <pub-id pub-id-type="doi">10.1371/journal.pbio.1000502</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>3D Dilated Multi-Fiber Network for Real-Time Brain Tumor Segmentation in MRI</article-title>,&#x201d; in <conf-name>22nd International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name>, <fpage>184</fpage>&#x2013;<lpage>192</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-32248-9_21</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L.-C.</given-names>
</name>
<name>
<surname>Papandreou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kokkinos</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Murphy</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Yuille</surname>
<given-names>A. L.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>40</volume> (<issue>4</issue>), <fpage>834</fpage>&#x2013;<lpage>848</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2017.2699184</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>L. C.</given-names>
</name>
<name>
<surname>Papandreou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Schroff</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Adam</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Rethinking Atrous Convolution for Semantic Image Segmentation</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1706.05587</comment>. </citation>
</ref>
<ref id="B7">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chollet</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Xception: Deep Learning with Depthwise Separable Convolutions</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>1251</fpage>&#x2013;<lpage>1258</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2017.195</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
</person-group>, &#x201c;<article-title>TransUNet: Transformers Make Strong Encoders for Medical Image Segmentation</article-title>,&#x201d; <comment>arXiv:2102.04306</comment>, <year>2021</year>. </citation>
</ref>
<ref id="B8">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>RepVGG: Making VGG-Style ConvNets Great Again</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>13733</fpage>&#x2013;<lpage>13742</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr46437.2021.01352</pub-id> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>CE-net: Context Encoder Network for 2D Medical Image Segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>38</volume>, <fpage>2281</fpage>&#x2013;<lpage>2292</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2019.2903562</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep Residual Learning for Image Recognition</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>. </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hoover</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goldbaum</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Locating the Optic Nerve in a Retinal Image Using the Fuzzy Convergence of the Blood Vessels</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>22</volume> (<issue>8</issue>), <fpage>951</fpage>&#x2013;<lpage>958</lpage>. <pub-id pub-id-type="doi">10.1109/tmi.2003.815900</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kalenichenko</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Weyand</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). &#x201c;<article-title>MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1704.04861</comment>. </citation>
</ref>
<ref id="B13">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sandler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L. C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>Searching for MobileNetV3</article-title>,&#x201d; in <conf-name>IEEE International Conference on Computer Vision</conf-name>, <fpage>1314</fpage>&#x2013;<lpage>1324</lpage>. </citation>
</ref>
<ref id="B14">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Laurens</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Weinberger</surname>
<given-names>K. Q.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>Densely Connected Convolutional Networks</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>2261</fpage>&#x2013;<lpage>2269</lpage>. </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ibtehaz</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Rahman</surname>
<given-names>M. S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MultiResUNet : Rethinking the U-Net Architecture for Multimodal Biomedical Image Segmentation</article-title>. <source>Neural Networks</source> <volume>121</volume>, <fpage>74</fpage>&#x2013;<lpage>87</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2019.08.025</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Isensee</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jaeger</surname>
<given-names>P. F.</given-names>
</name>
<name>
<surname>Kohl</surname>
<given-names>S. A. A.</given-names>
</name>
<name>
<surname>Petersen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Maier-Hein</surname>
<given-names>K. H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>nnU-Net: a Self-Configuring Method for Deep Learning-Based Biomedical Image Segmentation</article-title>. <source>Nat. Methods</source> <volume>18</volume>, <fpage>203</fpage>&#x2013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-020-01008-z</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Jie</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Gang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Albanie</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Squeeze-and-Excitation Networks</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>7132</fpage>&#x2013;<lpage>7141</lpage>. </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jns</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Si</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Mhy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Halim Yulius</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Kien Yee</surname>
<given-names>Y. E.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Incorporating Convolutional Neural Networks and Sequence Graph Transform for Identifying Multilabel Protein Lysine Ptm Sites</article-title>. <source>Chemometrics Intell. Lab. Syst.</source> <volume>206</volume>, <fpage>104171</fpage>. <pub-id pub-id-type="doi">10.1016/j.chemolab.2020.104171</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>Q.-T.</given-names>
</name>
<name>
<surname>Yapp</surname>
<given-names>E. K. Y.</given-names>
</name>
<name>
<surname>Ou</surname>
<given-names>Y.-Y.</given-names>
</name>
<name>
<surname>Yeh</surname>
<given-names>H.-Y.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>DeepETC: A Deep Convolutional Neural Network Architecture for Investigating and Classifying Electron Transport Chain&#x27;s Complexes</article-title>. <source>Neurocomputing</source> <volume>375</volume>, <fpage>71</fpage>&#x2013;<lpage>79</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2019.09.070</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Yapp</surname>
<given-names>E. K. Y.</given-names>
</name>
<name>
<surname>Nagasundaram</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Yeh</surname>
<given-names>H.-Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Classifying Promoters by Interpreting the Hidden Information of DNA Sequences via Deep Learning and Combination of Continuous FastText N-Grams</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>7</volume>, <fpage>305</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2019.00305</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lei</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Nandi</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Lightweight V-Net for Liver Segmentation</article-title>,&#x201d; in <conf-name>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing</conf-name> (<publisher-loc>Virtual Barcelona</publisher-loc>: <publisher-name>ICASSP</publisher-name>), <fpage>1379</fpage>&#x2013;<lpage>1383</lpage>. <pub-id pub-id-type="doi">10.1109/ICASSP40776.2020.9053454</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>PyConvU-Net: PyConvU-Net: a Lightweight and Multiscale Network for Biomedical Image Segmentation</article-title>. <source>BMC Bioinformatics</source> <volume>22</volume> (<issue>14</issue>). <pub-id pub-id-type="doi">10.1186/s12859-020-03943-2</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Connection Sensitive Attention U-NET for Accurate Retinal Vessel Segmentation</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1903.05558</comment>. </citation>
</ref>
<ref id="B26">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Milan</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Reid</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>RefineNet: Multi-Path Refinement Networks for High-Resolution Semantic Segmentation</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>1925</fpage>&#x2013;<lpage>1934</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2017.549</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Rabinovich</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Berg</surname>
<given-names>A. C.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>ParseNet: Looking Wider to See Better</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1506.04579</comment>. </citation>
</ref>
<ref id="B28">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>H. T.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision</conf-name>, <fpage>116</fpage>&#x2013;<lpage>131</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-01264-9_8</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Nekrasov</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Reid</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Light-Weight RefineNet for Real-Time Semantic Segmentation</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1810.03272</comment>. </citation>
</ref>
<ref id="B30">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ni</surname>
<given-names>Z.-L.</given-names>
</name>
<name>
<surname>Bian</surname>
<given-names>G.-B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.-H.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>Z.-G.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>X.-L.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). &#x201c;<article-title>RAUNet: Residual Attention U-Net for Semantic Segmentation of Cataract Surgical Instruments</article-title>,&#x201d; in <conf-name>26th International Conference on Neural Information Processing</conf-name>, <fpage>139</fpage>&#x2013;<lpage>149</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-36711-4_13</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ronneberger</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Fischer</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Brox</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>U-net: Convolutional Networks for Biomedical Image Segmentation</article-title>,&#x201d; in <conf-name>International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name>, <fpage>234</fpage>&#x2013;<lpage>241</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-24574-4_28</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sandler</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhmoginov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L. C.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>MobileNetV2: Inverted Residuals and Linear Bottlenecks</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>4510</fpage>&#x2013;<lpage>4520</lpage>. </citation>
</ref>
<ref id="B33">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning</conf-name>, <fpage>448</fpage>&#x2013;<lpage>456</lpage>. </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shelhamer</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Long</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Fully Convolutional Networks for Semantic Segmentation</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>39</volume> (<issue>4</issue>), <fpage>640</fpage>&#x2013;<lpage>651</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2016.2572683</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Simonyan</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2014</year>). &#x201c;<article-title>Very Deep Convolutional Networks for Large-Scale Image Recognition</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <comment>arXiv:1409.1556</comment>. </citation>
</ref>
<ref id="B36">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vanhoucke</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Alemi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning</article-title>,&#x201d; in <conf-name>31st AAAI Conference on Artificial Intelligence</conf-name>, <fpage>4278</fpage>&#x2013;<lpage>4284</lpage>. </citation>
</ref>
<ref id="B37">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sermanet</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Rabinovich</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Going Deeper with Convolutions</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>1</fpage>&#x2013;<lpage>9</lpage>. </citation>
</ref>
<ref id="B38">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Szegedy</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Vanhoucke</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Ioffe</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shlens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wojna</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Rethinking the Inception Architecture for Computer Vision</article-title>,&#x201d; in <conf-name>IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>2818</fpage>&#x2013;<lpage>2826</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2016.308</pub-id> </citation>
</ref>
<ref id="B39">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Tarasiewicz</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kawulok</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nalepa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Lightweight U-Nets for Brain Tumor Segmentation</article-title>,&#x201d; in <source>Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries. BrainLes</source>. <source>Lecture Notes in Computer Science</source>. Editors <person-group person-group-type="editor">
<name>
<surname>Crimi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bakas</surname>
<given-names>S.</given-names>
</name>
</person-group> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <volume>Vol. 12659</volume>, <fpage>3</fpage>&#x2013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-72087-2_1</pub-id> </citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tschandl</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Rosendahl</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kittler</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>The HAM10000 Dataset, a Large Collection of Multi-Source Dermatoscopic Images of Common Pigmented Skin Lesions</article-title>. <source>Sci. Data</source> <volume>5</volume> (<issue>1</issue>), <fpage>180161</fpage>&#x2013;<lpage>180169</lpage>. <pub-id pub-id-type="doi">10.1038/sdata.2018.161</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Valanarasu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Oza</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hacihaliloglu</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Medical Transformer: Gated Axial-Attention for Medical Image Segmentation</article-title>.&#x201d; <comment>arXiv:2102.10662</comment>. </citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>K. C. L.</given-names>
</name>
<name>
<surname>Moradi</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>SegNAS3D: Network Architecture Search with Derivative-free Global Optimization for 3D Image Segmentation</article-title>,&#x201d; in <conf-name>22nd International Conference on Medical Image Computing and Computer-Assisted Intervention</conf-name>, <fpage>393</fpage>&#x2013;<lpage>401</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-32248-9_44</pub-id> </citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sang</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>BiSeNet V2: Bilateral Network with Guided Aggregation for Real-Time Semantic Segmentation</article-title>. <source>Int. J. Comp. Vis.</source>, <fpage>1</fpage>&#x2013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-021-01515-2</pub-id> </citation>
</ref>
<ref id="B43">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Peng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sang</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>BiSeNet: Bilateral Segmentation Network for Real-Time Semantic Segmentation</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>, <fpage>334</fpage>&#x2013;<lpage>349</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-01261-8_20</pub-id> </citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on Computer Vision and Pattern Recognition</conf-name>, <fpage>6848</fpage>&#x2013;<lpage>6856</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2018.00716</pub-id> </citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Coleman</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Kerr</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>DENSE-INception U-Net for Medical Image Segmentation</article-title>. <source>Comp. Methods Programs Biomed.</source> <volume>192</volume>, <fpage>105395</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2020.105395</pub-id> </citation>
</ref>
<ref id="B46">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>ICNet for Real-Time Semantic Segmentation on High-Resolution Images</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>, <fpage>418</fpage>&#x2013;<lpage>434</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-01219-9_25</pub-id> </citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zheng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Rethinking Semantic Segmentation from a Sequence-To-Sequence Perspective with Transformers</article-title>. <source>IEEE Conf. Comp. Vis. Pattern Recognition</source>, <fpage>6881</fpage>&#x2013;<lpage>6890</lpage>. </citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nie</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Adeli</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lian</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>High-Resolution Encoder-Decoder Networks for Low-Contrast Medical Image Segmentation</article-title>. <source>IEEE Trans. Image Process.</source> <volume>29</volume> (<issue>99</issue>), <fpage>461</fpage>&#x2013;<lpage>475</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2019.2919937</pub-id> </citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Siddiquee</surname>
<given-names>M. M. R.</given-names>
</name>
<name>
<surname>Tajbakhsh</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>UNet&#x2b;&#x2b;: Redesigning Skip Connections to Exploit Multiscale Features in Image Segmentation</article-title>. <source>IEEE Trans. Med. Imaging</source> <volume>39</volume> (<issue>6</issue>), <fpage>1856</fpage>&#x2013;<lpage>1867</lpage>. <pub-id pub-id-type="doi">10.1109/tmi.2019.2959609</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>